In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset from 'GermanCredit.csv'. The path is relative, so it is
# resolved against the current working directory.
creditData = pd.read_csv('GermanCredit.csv')

# Drop the columns (3 by default) that contain the most 'none' values
def dropNoneColumns(df, n=3, verbose=False):
    """Return ``df`` without the columns that contain the most 'none' values.

    Each column is scored by how many of its cells equal the literal string
    'none'. Up to ``n`` of the highest-scoring columns are dropped. Columns
    with no 'none' cells are never dropped, so fewer than ``n`` columns may
    be removed.

    Args:
        df: DataFrame to filter. It is not modified.
        n: Maximum number of columns to drop.
        verbose: When true, print the per-column 'none' counts (highest
            first) before returning.

    Returns:
        A new DataFrame with the selected columns removed.
    """
    noneCounts = df.apply(lambda col: (col == 'none').sum()).sort_values(ascending=False)
    if verbose:
        # This print used to sit after the return statement and never ran.
        print(noneCounts)
    columnsToDrop = noneCounts[noneCounts > 0].head(n).index.tolist()
    return df.drop(columns=columnsToDrop)

creditData = dropNoneColumns(creditData)
print("Shape after dropping least useful columns:", creditData.shape)


# Strip apostrophes and surrounding whitespace from every object (text) column.
# regex=False treats the apostrophe as a literal pattern on every pandas
# version; the default for `regex` changed from True to False in pandas 2.0.
# NOTE(review): the 'none' counting above runs before this cleanup, so a raw
# value written as "'none'" (with apostrophes) would not have been counted.
# Confirm the raw file contains no such values.
for col in creditData.select_dtypes(include='object').columns:
    creditData[col] = creditData[col].str.replace("'", "", regex=False).str.strip()

print(creditData.head())

# Derive readable categorical columns from the raw bracket labels.
# Series.map yields NaN for any raw value that is missing from the mapping.
creditData['checkingStatus'] = creditData['checking_status'].map({
    'no checking': 'No Checking',
    '<0': 'Low',
    '0<=X<200': 'Medium',
    '>=200': 'High'
})

# Both upper brackets ('500<=X<1000' and '>=1000') are collapsed into 'High'.
creditData['savingsStatus'] = creditData['savings_status'].map({
    'no known savings': 'No Savings',
    '<100': 'Low',
    '100<=X<500': 'Medium',
    '500<=X<1000': 'High',
    '>=1000': 'High'
})

print("\nUnique checking status values:", creditData['checkingStatus'].unique())
print("Unique savings status values:", creditData['savingsStatus'].unique())

# Encode the target column as 1 for 'good' and 0 for 'bad'. This assignment
# used to appear twice with identical code; one copy is enough.
creditData['creditClass'] = creditData['class'].map({'good': 1, 'bad': 0})
print("\nClass value counts:")
print(creditData['creditClass'].value_counts())

def mapEmployment(val):
    """Translate a raw employment-duration bracket into a descriptive label.

    Values that match none of the known brackets are returned unchanged.
    """
    labelPairs = (
        ('unemployed', 'Unemployed'),
        ('<1', 'Amateur'),
        ('1<=X<4', 'Professional'),
        ('4<=X<7', 'Experienced'),
        ('>=7', 'Expert'),
    )
    for rawValue, label in labelPairs:
        if val == rawValue:
            return label
    # Unknown bracket: pass the input through untouched.
    return val

# Label every row's employment bracket, store the labels as a new column,
# and list the distinct labels that occur.
employmentLabels = creditData['employment'].apply(mapEmployment)
creditData['employmentLevel'] = employmentLabels
print("\nUnique employment levels:")
print(employmentLabels.unique())

# ANALYSIS
# 1. Foreign-worker status vs credit class
print("\nForeign Worker vs Credit Class:")
print(pd.crosstab(creditData['foreign_worker'], creditData['creditClass']))

# 2. Employment level vs savings status
print("\nEmployment vs Savings Status:")
print(pd.crosstab(creditData['employmentLevel'], creditData['savingsStatus']))

# 3. Average credit amount for single males whose employment level is 'Experienced'
isSingleMale = creditData['personal_status'] == 'male single'
isExperienced = creditData['employmentLevel'] == 'Experienced'
avgCreditAmount = creditData.loc[isSingleMale & isExperienced, 'credit_amount'].mean()
print("\nAverage credit amount (single males, Experienced):", avgCreditAmount)

# 4. Average duration per job type
print("\nAverage credit duration by job:")
print(creditData.groupby('job')['duration'].mean())

# 5. Most common checking and savings status for purpose 'education'
eduSubset = creditData[creditData['purpose'] == 'education']
# mode() returns an empty Series when the subset has no non-null values (no
# education rows, or every status left unmapped as NaN). Indexing [0] on that
# would raise, so fall back to None instead.
checkingModes = eduSubset['checkingStatus'].mode()
savingModes = eduSubset['savingsStatus'].mode()
mostCommonChecking = checkingModes.iloc[0] if not checkingModes.empty else None
mostCommonSaving = savingModes.iloc[0] if not savingModes.empty else None
print("\nMost common checking status (education):", mostCommonChecking)
print("Most common savings status (education):", mostCommonSaving)

# VISUALISATION
# 1. Side-by-side grouped bar charts: row counts per status category, with
#    one bar per personal_status value.
statusPanels = (
    ('savingsStatus', 'Savings Status vs Personal Status', 'Savings Status'),
    ('checkingStatus', 'Checking Status vs Personal Status', 'Checking Status'),
)

plt.figure(figsize=(14, 6))
for panelIndex, (statusColumn, panelTitle, axisLabel) in enumerate(statusPanels, start=1):
    panelAxis = plt.subplot(1, 2, panelIndex)
    # Rows: status category; columns: personal_status; cells: number of rows.
    countTable = creditData.groupby([statusColumn, 'personal_status']).size().unstack()
    countTable.plot(kind='bar', stacked=False, ax=panelAxis)
    plt.title(panelTitle)
    plt.xlabel(axisLabel)
    plt.ylabel('Count')
    plt.grid(True)

plt.tight_layout()
plt.show()

# 2. Bar plot: mean age per property_magnitude, restricted to rows whose
#    credit_amount exceeds 4000.
highCredit = creditData[creditData['credit_amount'] > 4000]
meanAgeByProperty = highCredit.groupby('property_magnitude')['age'].mean()
meanAgeByProperty.plot.bar(figsize=(8, 6))
plt.title('Average Age vs Property Magnitude (Credit > 4000)')
plt.xlabel('Property Magnitude')
plt.ylabel('Average Age')
plt.grid(True)
plt.tight_layout()
plt.show()

# 3. Pie charts for rows with 'High' savings status and age above 40: one
#    chart each for personal status, credit history and job.
olderMask = creditData['age'] > 40
highSavingsMask = creditData['savingsStatus'] == 'High'
highSavingsOlder = creditData[highSavingsMask & olderMask]
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

pieSpecs = (
    ('personal_status', 'Personal Status'),
    ('credit_history', 'Credit History'),
    ('job', 'Job'),
)
for pieAxis, (pieColumn, pieTitle) in zip(axs, pieSpecs):
    shares = highSavingsOlder[pieColumn].value_counts()
    shares.plot.pie(ax=pieAxis, autopct='%1.1f%%', title=pieTitle)

plt.tight_layout()
plt.show()
