## Exercise 5
### Classification and Clustering
### Amir - 2022
# ....................... 


In [None]:
''' 
1- Open creditcard2_smaller.csv file
2- there are roughly 10000 credit card transactions information saved here
3- for each transaction, there are 29 features (columns 0 to 28), and a transaction class label (column 29) 
4- class label== 0 --> a valid transaction, whereas class label== 1 --> a fraudulent transaction
5- plot the histogram of class labels; what can you conclude from it?
6- keep 70% of your samples for training, and 30% for testing. 
7- develop a decision tree classifier, train it, and test it. 
8- report the confusion matrix and the classification report
9- Try to optimize the parameters and run the program once again
10- report the best results achieved
11- What Is The Biggest Weakness of your classifier?
12- repeat steps 7 to 11 for a logistic regressor model

'''

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old aliases, so fall back to the new name when needed.
plot_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(plot_style)
# Credit-card transactions: feature columns followed by the class label
# in the last column.
df = pd.read_csv('creditcard2_smaller.csv')

In [None]:
# (number of rows, number of columns) of the loaded transaction table
df.shape

In [None]:
# Column labels; the last one is used as the class label further down
df.columns

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Preview the first rows of the table
df.head()

In [None]:
# One histogram per column (features and class label), stacked vertically.
# The figure size is passed to subplots() directly so the tall layout does not
# leak into every later figure through the global rcParams.
n_columns = len(df.columns)
fig, axs = plt.subplots(n_columns, figsize=(5, 50))
# np.atleast_1d: subplots() returns a bare Axes (not an array) for one panel.
for i, ax in enumerate(np.atleast_1d(axs)):
    # df.iloc[:, i] selects COLUMN i; df.iloc[i] would select ROW i and plot
    # a meaningless histogram of one transaction's mixed feature values.
    ax.hist(df.iloc[:, i])
    ax.set_title(str(df.columns[i]))
fig.tight_layout()

In [None]:
# df[df.iloc[-1]==1]['fico'].plot.hist(bins=30,alpha=0.5,color='blue', label='Valid=1')
# df[df.iloc[-1]==0]['fico'].plot.hist(bins=30,alpha=0.5, color='red', label='Valid=0')
# plt.legend(fontsize=15)
# plt.title ("Histogram of FICO score by approved or disapproved credit policies", fontsize=16)
# plt.xlabel("FICO score", fontsize=14)

In [None]:
from sklearn.model_selection import train_test_split
# Features: every column except the last. Target: the last column (class label),
# kept as a one-column DataFrame because later cells call y_train.values.ravel().
df_dropped = df.iloc[:, :-1]
X = df_dropped
y = df.iloc[:, -1:]
# stratify keeps the valid/fraud ratio the same in both splits, which matters
# when one class is rare; random_state makes the 70/30 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fully grown (max_depth=None) Gini tree. random_state fixes the random feature
# permutation used to break ties between equally good splits, so repeated runs
# build the same tree.
dtree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)

In [None]:
# Fit the decision tree on the training split
dtree.fit(X_train,y_train)

In [None]:
# Hard class predictions for the held-out test split
predictions = dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix of the test-set predictions (true labels first).
cm = confusion_matrix(y_test, predictions)
print(cm)
# The two diagonal cells are the correctly classified samples of each class.
n_correct = cm[0, 0] + cm[1, 1]
overall_accuracy = round(n_correct / cm.sum(), 3)
print ("Accuracy of prediction:", overall_accuracy)

In [None]:
from matplotlib.colors import LogNorm, Normalize
plt.rcParams["figure.figsize"] = (5,5)
# fmt='d' prints the integer counts in full; seaborn's default '.2g' would
# show large counts in scientific notation. LogNorm keeps the small
# off-diagonal counts visible next to the large diagonal ones.
ax = sns.heatmap(cm, linewidth=1, annot=True, fmt='d', norm=LogNorm())
ax.set_xlabel('Predicted class')
ax.set_ylabel('True class')
plt.show()

In [None]:
# Text report of per-class metrics for the test-set predictions
print(classification_report(y_test,predictions))

In [None]:
# Use AUC (area under the ROC curve) as evaluation metric, as this is a binary
# classification problem.
from sklearn.metrics import roc_curve, auc
# A ROC curve is traced by sweeping a threshold over continuous scores. Hard
# 0/1 predictions collapse it to a single operating point, so score with the
# predicted probability of the positive class (column 1 = label 1) instead.
positive_scores = dtree.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
# Sweep the tree depth from 1 to 32 and record train/test AUC for each depth
# to see where the tree starts to overfit.
# np.arange gives integer depths; np.linspace produced floats (1.0, 2.0, ...),
# which current scikit-learn rejects for max_depth.
max_depths = np.arange(1, 33)
train_results = []
test_results = []
for max_depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=int(max_depth), random_state=42)
    dt.fit(X_train, y_train)
    # Score with class-1 probabilities so the ROC curve has more than one
    # operating point (shallow trees give graded leaf probabilities).
    train_scores = dt.predict_proba(X_train)[:, 1]
    fpr_train, tpr_train, _ = roc_curve(y_train, train_scores)
    train_results.append(auc(fpr_train, tpr_train))
    test_scores = dt.predict_proba(X_test)[:, 1]
    fpr_test, tpr_test, _ = roc_curve(y_test, test_scores)
    test_results.append(auc(fpr_test, tpr_test))

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_depths, train_results, 'b', label='Train AUC')
line2, = plt.plot(max_depths, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Evaluate random forests of increasing size (5, 10, ..., 100 trees).
# Index 0 of each list is an unused placeholder so that index i corresponds to
# a forest of i*5 trees; the plotting cell slices with [1:nsimu].
nsimu = 21
accuracy = [0] * nsimu
ntree = [0] * nsimu
cmatrices = [0] * nsimu
for i in range(1, nsimu):
    ntree[i] = i * 5
    # random_state makes the bootstrap samples and feature subsets repeatable,
    # so the accuracy-vs-size curve does not change from run to run.
    rfc = RandomForestClassifier(
        n_estimators=ntree[i],
        min_samples_split=10,
        max_depth=None,
        criterion='gini',
        random_state=42,
    )
    rfc.fit(X_train, y_train.values.ravel())
    rfc_pred = rfc.predict(X_test)
    accuracy[i] = accuracy_score(y_test, rfc_pred)
    cmatrices[i] = confusion_matrix(y_test, rfc_pred)

In [None]:
# Test accuracy against forest size; index 0 of both lists is a placeholder.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=ntree[1:nsimu], y=accuracy[1:nsimu], s=60, c='red')
ax.set_title("Number of trees in the Random Forest vs. prediction accuracy (criterion: 'gini')", fontsize=18)
ax.set_xlabel("Number of trees", fontsize=15)
ax.set_ylabel("Prediction accuracy from confusion matrix", fontsize=15)

In [None]:
# `cm` still holds the decision-tree confusion matrix at this point; the
# random-forest matrices are stored in `cmatrices`. Show the forest with the
# highest test accuracy (index 0 is a placeholder with accuracy 0).
best_idx = int(np.argmax(accuracy))
sns.heatmap(cmatrices[best_idx], linewidth=1, annot=True, fmt='d', norm=LogNorm())
plt.title(f"Random forest with {ntree[best_idx]} trees")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# Sweep the inverse regularisation strength C = 0.01 ... 0.09 and record the
# weighted-average F1 on the test split for each model.
# Index 0 of every list is an unused placeholder; later cells read predictions[1].
nsimu=10
penalty=[0]*nsimu
logmodel=[0]*nsimu
predictions =[0]*nsimu
class_report = [0]*nsimu
f1=[0]*nsimu
for i in range(1,nsimu):
    c_value = i/100
    logmodel[i] = LogisticRegression(C=c_value, tol=1e-4, max_iter=600, n_jobs=4)
    logmodel[i].fit(X_train, y_train.values.ravel())
    predictions[i] = logmodel[i].predict(X_test)
    class_report[i] = classification_report(y_test, predictions[i])
    # Compute the weighted-average F1 as a float. Parsing it out of the text
    # report yields a *string*, which matplotlib plots as a category.
    f1[i] = f1_score(y_test, predictions[i], average='weighted')
    # Penalty strength is the inverse of C (was 1000/i, 10x off for C=i/100).
    penalty[i] = 1/c_value

# Plot every fitted model; slicing to len-2 silently dropped the last two.
plt.scatter(penalty[1:], f1[1:])
plt.title("F1-score vs. regularization parameter",fontsize=20)
plt.xlabel("Penalty parameter",fontsize=17)
plt.ylabel("F1-score on test data",fontsize=17)
plt.show()



In [None]:
# Confusion matrix for the first logistic model of the sweep (list index 1).
first_model_pred = predictions[1]
cm = confusion_matrix(y_test, first_model_pred)
print(cm)
# Correct predictions are the two diagonal cells.
n_correct = cm[0, 0] + cm[1, 1]
print ("Accuracy of prediction:", round(n_correct / cm.sum(), 3))

In [None]:
# fmt='d' prints the integer counts in full; seaborn's default '.2g' would
# show large counts in scientific notation.
sns.heatmap(cm, linewidth=1, annot=True, fmt='d', norm=LogNorm())
plt.show()

In [None]:
'''  
1- Open titanic_train.csv
2- Remove any data samples with empty cells
3- Try to measure the correlation between 'Pclass' and 'Fare'
4- Try to measure the correlation between 'Pclass' and 'Survived'
5- Use variables below and C-Means (K-Means) clustering, try to cluster the data into 2 clusters
Variables: Survived, Pclass, Sex, Age, Fare, Embarked
6- Discuss the results
7- Repeat 5 and 6 for 4 clusters
'''

In [None]:
# Load the Titanic training data and preview the first rows
df_train = pd.read_csv('titanic_train.csv') # Training set is already available
df_train.head()

In [None]:
# Per-column dtype and non-null count (reveals columns with missing values)
df_train.info()

In [None]:
# Correlation matrix of passenger class and fare.
pclass_fare_cols = ["Pclass", "Fare"]
df_Pclass_Fare = df_train.loc[:, pclass_fare_cols]
df_Pclass_Fare.corr()

In [None]:
# Heatmap of the Pclass/Fare correlation matrix.
fig, ax = plt.subplots(figsize=(10, 7))
pclass_fare_corr = df_Pclass_Fare.corr()
sns.heatmap(pclass_fare_corr, annot=True, linewidths=2, cmap="gray", ax=ax)

In [None]:
# Correlation matrix of passenger class and survival.
pclass_survived_cols = ["Pclass", "Survived"]
df_Pclass_Survived = df_train.loc[:, pclass_survived_cols]
df_Pclass_Survived.corr()

In [None]:
# Heatmap of the Pclass/Survived correlation matrix.
fig, ax = plt.subplots(figsize=(10, 7))
pclass_survived_corr = df_Pclass_Survived.corr()
sns.heatmap(pclass_survived_corr, annot=True, linewidths=2, ax=ax)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Build the clustering table as a NEW frame. The original aliased df_train and
# dropped columns in place, which mutated df_train too and raised KeyError
# when the cell was run a second time.
df_train_km = df_train.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin']
)
# One-hot encode the categorical columns (first level dropped as the baseline).
# dtype=int: pandas >= 2.0 returns boolean dummies by default, and describe()
# leaves boolean columns out, which would make the describe()-based column
# lists used further down shorter than the K-Means centroid arrays.
sex = pd.get_dummies(df_train_km['Sex'], drop_first=True, dtype=int)
embark = pd.get_dummies(df_train_km['Embarked'], drop_first=True, dtype=int)
df_train_km = pd.concat(
    [df_train_km.drop(columns=['Sex', 'Embarked']), sex, embark],
    axis=1,
)

In [None]:
# Mean age per passenger class, kept as a one-column frame for reuse below.
f_class_Age = df_train_km.groupby('Pclass')['Age'].mean().to_frame()
age_ax = f_class_Age.plot.bar(y='Age')
age_ax.set_title("Average age of passengers by class", fontsize=17)
age_ax.set_ylabel("Age (years)", fontsize=17)
age_ax.set_xlabel("Passenger class", fontsize=17)

In [None]:
# Mean ages as a plain list, in the row order of f_class_Age; read as a global
# by impute_age. NOTE: the name `a` is reassigned later in the notebook (centroid
# differences), so re-running the imputation out of order would use those values.
a=list(f_class_Age['Age'])
def impute_age(cols, class_means=None):
    """Return the passenger's age, filling a missing value with a class mean.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.
    class_means : sequence of float, optional
        Replacement ages for classes 1, 2 and 3, in that order. Defaults to
        the notebook-level list ``a``.

    Returns
    -------
    The original age when it is present; otherwise ``class_means[0]`` for
    class 1, ``class_means[1]`` for class 2 and ``class_means[2]`` for any
    other class value.
    """
    # Unpack by position through list(): cols[0] / cols[1] on a label-indexed
    # Series is a deprecated positional lookup in pandas 2.x.
    Age, Pclass = list(cols)[:2]

    if not pd.isnull(Age):
        return Age

    means = a if class_means is None else class_means
    if Pclass == 1:
        return means[0]
    if Pclass == 2:
        return means[1]
    return means[2]
    
# Fill missing ages row by row, then plot the per-column 'count' from describe().
age_and_class = df_train_km[['Age', 'Pclass']]
df_train_km['Age'] = age_and_class.apply(impute_age, axis=1)
d = df_train_km.describe()
dT = d.transpose()
count_ax = dT.plot.bar(y='count')
count_ax.set_title("Bar plot of the count of numeric features", fontsize=17)
In [None]:
# Per-column dtype and non-null count of the clustering table
df_train_km.info()

In [None]:
# `kmeans` was never created before this cell, so a fresh "Restart & Run All"
# failed here with NameError. Build the 2-cluster model first (same settings as
# the later K-Means cells, plus a seed for repeatable cluster ids), then fit it
# on the full table, 'Survived' included.
kmeans = KMeans(n_clusters=2, verbose=0, tol=1e-3, max_iter=50, n_init=10, random_state=42)
kmeans.fit(df_train_km)

In [None]:
# Centroids of the fitted K-Means model; display the shape of the array
clus_cent=kmeans.cluster_centers_
clus_cent.shape

In [None]:
df_train_km[df_train_km['Survived']==1].describe() # Statistics for rows with Survived == 1 only

In [None]:
df_train_km[df_train_km['Survived']==0].describe() # Statistics for rows with Survived == 0 only

In [None]:
# Label the centroid array with the column names reported by describe().
df_desc = pd.DataFrame(data=df_train_km.describe())
feat = df_desc.columns.tolist()
print(feat)
kmclus = pd.DataFrame(data=clus_cent, columns=feat)
kmclus.shape

In [None]:
# Cluster index assigned to each row by the fitted K-Means model
kmeans.labels_

In [None]:
def converter(cluster):
    """Map a label to 1 when it equals 1, and to 0 for every other value."""
    return 1 if cluster == 1 else 0

In [None]:
# .copy() is required: plain assignment only creates a second name for the same
# DataFrame, so adding 'Cluster' to df1 also added it to df_train_km and fed the
# survival label into every later clustering of that frame.
df1 = df_train_km.copy()
df1['Cluster'] = df_train_km['Survived'].apply(converter)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
# Compare the survival-derived labels with the K-Means cluster assignments.
reference_labels = df1['Cluster']
cluster_labels = kmeans.labels_
print(confusion_matrix(reference_labels, cluster_labels))
print(classification_report(reference_labels, cluster_labels))

In [None]:
# Split the table by the value of 'Survived'.
survived_mask = df_train_km['Survived'] == 1
non_survived_mask = df_train_km['Survived'] == 0
df_survived = df_train_km[survived_mask]
df_non_survived = df_train_km[non_survived_mask]

In [None]:
# Cluster into 2 groups on the features only; 'Survived' is held out so it can
# be compared with the cluster assignment afterwards.
features_km = df_train_km.drop('Survived', axis=1)
# random_state makes the centroid initialisation repeatable; without it the
# cluster ids (0/1) can swap between runs and flip the later confusion matrix.
kmeans = KMeans(n_clusters=2, verbose=0, tol=1e-3, max_iter=50, n_init=10, random_state=42)
kmeans.fit(features_km)

clus_cent = kmeans.cluster_centers_

df_desc = pd.DataFrame(features_km.describe())
feat = list(df_desc.columns)

kmclus = pd.DataFrame(clus_cent, columns=feat)
# Row 1 of diff() is (centroid 1 - centroid 0) for every feature.
# NOTE: this reuses the name `a`, which impute_age reads as the class-mean list.
a = np.array(kmclus.diff().iloc[1])


In [None]:
# One row per feature holding the difference between the two cluster centroids.
centroid_diff = pd.DataFrame(
    data=a,
    index=df_desc.columns,
    columns=['K-means cluster centroid-distance'],
)
# centroid_diff

In [None]:
# df_survived.mean()

In [None]:
# Add the per-feature means of both survival groups next to the centroid difference.
survived_means = df_survived.drop('Survived', axis=1).mean()
non_survived_means = df_non_survived.drop('Survived', axis=1).mean()
centroid_diff['Mean of corresponding entity (survived)'] = np.array(survived_means)
centroid_diff['Mean of corresponding entity (non_survived)'] = np.array(non_survived_means)
# centroid_diff

In [None]:
# kmeans.labels_

In [None]:
def converter(cluster):
    """Return 1 for a label equal to 1 and 0 for anything else."""
    if cluster != 1:
        return 0
    return 1

In [None]:
# .copy() is required: plain assignment only creates a second name for the same
# DataFrame, so adding 'Cluster' to df1 also added it to df_train_km and fed the
# survival label into every later clustering of that frame.
df1 = df_train_km.copy()
df1['Cluster'] = df_train_km['Survived'].apply(converter)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
# Compare the survival-derived labels with the K-Means cluster assignments.
reference_labels = df1['Cluster']
cluster_labels = kmeans.labels_
print(confusion_matrix(reference_labels, cluster_labels))
print(classification_report(reference_labels, cluster_labels))

In [None]:
# Split the table by the value of 'Survived'.
survived_mask = df_train_km['Survived'] == 1
non_survived_mask = df_train_km['Survived'] == 0
df_survived = df_train_km[survived_mask]
df_non_survived = df_train_km[non_survived_mask]

In [None]:
# Repeat the clustering with 4 groups, again on the features without 'Survived'.
features_km = df_train_km.drop('Survived', axis=1)
# random_state makes the centroid initialisation repeatable, so the cluster ids
# and centroid order do not change from run to run.
kmeans = KMeans(n_clusters=4, verbose=0, tol=1e-3, max_iter=50, n_init=10, random_state=42)
kmeans.fit(features_km)

clus_cent = kmeans.cluster_centers_

df_desc = pd.DataFrame(features_km.describe())
feat = list(df_desc.columns)
print(feat)
print(clus_cent.shape)

kmclus = pd.DataFrame(clus_cent, columns=feat)
# Row 1 of diff() is (centroid 1 - centroid 0) only; the other two centroids
# of the 4-cluster model are not part of this difference.
a = np.array(kmclus.diff().iloc[1])
