In [1]:
import pandas as pd
import scipy
from scipy import stats, optimize, interpolate
import sklearn
import matplotlib.pyplot as plt
from sklearn import preprocessing
import sklearn.metrics as metrics


In [2]:
df = pd.read_csv('tryMe_balanced.csv')
df.head()

Unnamed: 0,bits,non_zero_pixels,movement_level,mean,sub_mean_1,sub_mean_2,sub_mean_3,sub_mean_4,var_sub_blocks,sobel_h,...,cost_2,quality,intra_parts,skip_parts,inter_16x16_parts,inter_4x4_parts,inter_other_parts,frame_width,frame_height,relevant
0,0.235171,0.283816,0.025359,0.002366,0.061133,0.051159,0.119325,0.157762,0.000674,0.201656,...,0.239385,1,0,0,0,0,13,1,1,1
1,0.044435,0.025362,0.244842,0.003134,0.076514,0.067665,0.187908,0.180386,0.002005,0.200689,...,0.037362,3,5,0,2,0,2,2,2,1
2,0.098093,0.109903,0.921025,0.001312,0.051514,0.018864,0.095587,0.03232,0.002125,0.111573,...,0.016832,0,6,1,1,0,0,3,3,1
3,0.245441,0.23913,0.027827,0.001109,0.049414,0.005741,0.055122,0.067872,0.001425,0.091231,...,0.061133,0,0,0,2,1,7,1,1,1
4,0.437225,0.48913,0.025254,0.00607,0.26416,0.062846,0.32866,0.288658,0.032566,0.458804,...,0.410566,1,14,0,0,0,0,1,1,1


In [None]:
# Checking for null, nan
df.isnull().any()
df  = df.dropna() # by now
df.shape

In [None]:
f = plt.figure(figsize=(10, 7))
plt.matshow(df.corr(), fignum=f.number)
plt.xticks(range(df.shape[1]), df.columns, fontsize=14, rotation=90)
plt.yticks(range(df.shape[1]), df.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);      

In [None]:
# selected correlations
print(df["quality"].corr(df["relevant"]))
print(df["bits"].corr(df["relevant"]))
df["quality"].corr(df["bits"])

In [None]:
# creating a table of categorical and continuous data
cat = df.loc[:, df.nunique() < 200]
cont = df.loc[:, df.nunique() >= 200]

In [None]:
# Data Normalization
cont_val = cont.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
cont_scaled = min_max_scaler.fit_transform(cont_val)
cont_sc = pd.DataFrame(cont_scaled)
cont_sc.columns=cont.columns

In [None]:
cont_sc = cont_sc.round(3)

In [None]:
#cont_sc.head()

In [None]:
cont.corrwith(cat.relevant)

In [None]:
cont_sc.corrwith(cat.relevant)  # looks like normalozation of categorical variaables doesn't have an affect on correlation

In [None]:
# Caregorical data factorization
cat_fc =  pd.DataFrame(cat.apply(lambda x: pd.factorize(x)[0]))
cat_fc.head()

In [None]:
cat.corrwith(cat.relevant)

In [None]:
cat_fc.corrwith(cat.relevant) # feels like some date is not categorical as corelation reduces

In [None]:
cat_fc.head()

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.metrics import balanced_accuracy_score

from sklearn.metrics import precision_recall_curve


In [7]:
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
#print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))


confusion = metrics.confusion_matrix(y_test,y_pred)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+FN)  #recall metrics.recall_score(y_test, y_pred)
specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)

print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))

accuracy 0.6979
recall 0.6639
specificity 0.7319
precision 0.7116


In [8]:
from sklearn import tree

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.3, random_state=0)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train2, y_train2)

y_pred2 = clf.predict(X_test2)


confusion = metrics.confusion_matrix(y_test2,y_pred2)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+TN)  #recall metrics.recall_score(y_test, y_pred)
specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)

print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))


#tree.plot_tree(clf) 




accuracy 0.6246
recall 0.492
specificity 0.6335
precision 0.626


In [9]:
from sklearn.ensemble import RandomForestClassifier




X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.3, random_state=0)


#Create a Gaussian Classifier
clf=RandomForestClassifier()

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train3,y_train3)

y_pred3=clf.predict(X_test3)

confusion = metrics.confusion_matrix(y_test3,y_pred3)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+TN)  #recall metrics.recall_score(y_test, y_pred)
specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)

print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))

accuracy 0.7173
recall 0.4841
specificity 0.7389
precision 0.7264


In [None]:
data = pd.concat([cont_sc, cat_fc], axis=1)

In [None]:
data.shape

In [None]:
data_t = data.dropna()

In [None]:
data_t.shape

In [None]:
data.head()

In [None]:
X = data_t[data_t.columns[:-1]]
y = data_t[data_t.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


confusion = metrics.confusion_matrix(y_test,y_pred)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+FN)  #recall metrics.recall_score(y_test, y_pred)

specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)

print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))


print(confusion)

In [None]:
from sklearn import tree

X = data_t[data_t.columns[:-1]]
y = data_t[data_t.columns[-1]]

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.3, random_state=0)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train2, y_train2)

y_pred2 = clf.predict(X_test2)


confusion = metrics.confusion_matrix(y_test2,y_pred2)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+FN)  #recall metrics.recall_score(y_test, y_pred)
specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)

print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))



print(confusion)
#tree.plot_tree(clf) 




In [None]:
from sklearn.ensemble import RandomForestClassifier


X = data_t[data_t.columns[:-1]]
y = data_t[data_t.columns[-1]]


X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.3, random_state=0)


#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train3,y_train3)

y_pred3=clf.predict(X_test3)

confusion = metrics.confusion_matrix(y_test3,y_pred3)

TP = confusion[1,1]
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]

accuracy = (TP + TN) / float(TP+TN+FP+FN) # metrics.accuracy_score(y_test, y_pred)
sensitiviy = TP / float(TP+FN)  #recall metrics.recall_score(y_test, y_pred)
specificity = TN / float(TN+FP) #when the actual value is negative, how often is the predicion correct?
precision = TP / float(TP+FP)   #metrics.precision_score(y_test, y_pred)


print("accuracy", accuracy.round(4))  
print("recall", sensitiviy.round(4))
print("specificity", specificity.round(4))
print("precision",precision.round(4))


print(confusion)



In [None]:

#KMEANS


In [None]:
from kneed import KneeLocator
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
import seaborn as sns

from kneed import KneeLocator
from sklearn.cluster import KMeans
import numpy as np






In [None]:
kmeans = KMeans()
kmeans.fit(data_t)

In [None]:
print(kmeans.inertia_)

In [None]:
print(kmeans.cluster_centers_)

In [None]:
print(kmeans.n_iter_)

In [None]:
# A list holds the SSE values for each k
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_t)
    sse.append(kmeans.inertia_)

In [None]:
plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.title("Elbow method")
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
data_t

In [None]:
kl = KneeLocator(range(1, 11), 
                 sse, 
                 curve="convex", 
                 direction="decreasing")
kl.elbow

In [None]:
# A list holds the silhouette coefficients for each k
silhouette_coefficients = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_t)
    score = silhouette_score(data_t, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
plt.plot(range(2, 11), silhouette_coefficients)
plt.xticks(range(2, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score


import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


In [None]:
scaler = StandardScaler()

In [None]:
features, true_labels = make_moons(
    n_samples=250, 
    noise=0.05, 
    random_state=42  )
scaled_features = scaler.fit_transform(features)

In [None]:
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)

# Fit the algorithms to the features
kmeans.fit(scaled_features)
dbscan.fit(scaled_features)

 # Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
    scaled_features, kmeans.labels_
     ).round(2)
dbscan_silhouette = silhouette_score(
    scaled_features, dbscan.labels_
     ).round (2)

In [None]:
kmeans_silhouette
dbscan_silhouette

In [None]:
ari_kmeans = adjusted_rand_score(true_labels, kmeans.labels_)
ari_dbscan = adjusted_rand_score(true_labels, dbscan.labels_)

print(round(ari_kmeans, 2))
 

print(round(ari_dbscan, 2))


In [None]:
# Convert DataFrame to matrix
import pandas

mat = data_t.values
# Using sklearn
km = sklearn.cluster.KMeans(n_clusters=5)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pandas.DataFrame([data_t.index,labels]).T

In [None]:
results

In [None]:
def silhouette_analysis(X, X_pca, range_n_clusters):
    
    for n_clusters in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            y_lower = y_upper + 10  

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X_pca[0], X_pca[1], c=colors)
    
        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("PCA component 1")
        ax2.set_ylabel("PCA component 2")

        plt.suptitle(("\nSilhouette analysis for KMeans clustering "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

    plt.show()

In [None]:
pca=PCA(n_components=2)
data_pca=pd.DataFrame(pca.fit_transform(data_t))

In [None]:
silhouette_analysis(np.array(data_t), 
                    cont_pca, 
                    [2, 3, 4, 5, 6, 7, 8])

In [None]:
distances_linkage = linkage(data_t,
                            method = 'ward', 
                            metric = 'euclidean'
                           )

In [None]:
plt.figure(figsize=(12, 12))

plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('BITS')
plt.ylabel('Distance')

dendrogram(
    distances_linkage,
    #color_threshold = 0,
    no_labels = True
)

plt.show()

In [None]:
plt.figure(figsize=(12, 12))

plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('BITS')
plt.ylabel('Distance')

dendrogram(
    distances_linkage,
    truncate_mode='lastp',
    p=10,
    show_leaf_counts=True,    #False 
    show_contracted=True,
    #color_threshold = 0,
    no_labels = True
)

plt.show()

In [None]:
hcluster = AgglomerativeClustering(n_clusters = 3,
                                   affinity = 'euclidean',
                                   linkage = 'ward')
hcluster.fit_predict(data_t)

plt.figure(figsize=(9, 7))


plt.scatter(data_pca[0], 
            data_pca[1],
            c=hcluster.labels_
           )

plt.title("The visualization of the clustered data")
plt.xlabel("PCA component 1")
plt.ylabel("PCA component 2")
    
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 3)
kmeans.fit(data_t)
data_t['cluster'] = kmeans.labels_.astype(str)

In [None]:
sns.countplot(data=data_t.sort_values(by='cluster'), 
              x='cluster', 
              hue='relevant',
              palette='Set1')

In [None]:
sns.pairplot(data_t, 
             hue = 'cluster',
             palette='Set1')