In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#from sklearn.model_selection import train_test_split
#from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn_extra.cluster import KMedoids
from scipy.spatial.distance import euclidean

In [2]:
def complexity(ts):
    return np.sqrt(np.sum(np.diff(ts)**2))
# CID Distance function
def cid_distance(ts1, ts2):
    # Ensure both inputs are numpy arrays
    ts1 = np.asarray(ts1)
    ts2 = np.asarray(ts2)

    # Step 1: Compute standard Euclidean distance
    dist = euclidean(ts1, ts2)
    
    # Step 2: Compute the complexities of both time series
    c_ts1 = complexity(ts1)
    c_ts2 = complexity(ts2)
    
    # Step 3: Apply the CID formula
    return dist * np.sqrt(c_ts1 / c_ts2)

In [3]:
#18x18 grid, monday(4/11) - smsin / friday(8/11)
data = pd.read_csv("../csv/19x19/classif_18x18_smsin.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_smsin.csv")

data.head()

Unnamed: 0.1,Unnamed: 0,cellid,smsin0,smsin1,smsin2,smsin3,smsin4,smsin5,smsin6,smsin7,...,smsin15,smsin16,smsin17,smsin18,smsin19,smsin20,smsin21,smsin22,smsin23,WHF
0,0,4379,0.000378,0.000199,0.0,9e-06,0.000158,0.000204,0.000482,0.001992,...,0.004781,0.005944,0.00533,0.00547,0.0039,0.005816,0.003817,0.004231,0.000898,1
1,1,4380,0.001471,0.000311,2.8e-05,2e-06,0.000158,0.000302,0.000998,0.003448,...,0.008373,0.009585,0.010007,0.008427,0.007845,0.009887,0.006584,0.007445,0.00244,2
2,2,4381,0.001256,0.000577,5.2e-05,3e-05,8.6e-05,0.000308,0.000879,0.003441,...,0.006474,0.00629,0.006895,0.005045,0.004978,0.005282,0.003231,0.002469,0.001343,1
3,3,4382,0.002337,0.001663,0.000779,0.000445,0.00022,0.000678,0.002451,0.011564,...,0.023814,0.023962,0.02265,0.019704,0.017173,0.02117,0.016779,0.012174,0.008751,1
4,4,4383,0.002504,0.001642,0.000703,0.00039,0.000237,0.00071,0.002671,0.01231,...,0.022289,0.022932,0.023358,0.020317,0.018138,0.020992,0.017747,0.012685,0.009127,1


In [4]:
y = data.iloc[:,26] #h/w
x = data.iloc[:,2:26] #sms, calls, internet

In [5]:
y

0      1
1      2
2      1
3      1
4      1
      ..
236    2
237    2
238    2
239    2
240    2
Name: WHF, Length: 241, dtype: int64

In [6]:
x

Unnamed: 0,smsin0,smsin1,smsin2,smsin3,smsin4,smsin5,smsin6,smsin7,smsin8,smsin9,...,smsin14,smsin15,smsin16,smsin17,smsin18,smsin19,smsin20,smsin21,smsin22,smsin23
0,0.000378,0.000199,0.000000,0.000009,0.000158,0.000204,0.000482,0.001992,0.004211,0.005459,...,0.004141,0.004781,0.005944,0.005330,0.005470,0.003900,0.005816,0.003817,0.004231,0.000898
1,0.001471,0.000311,0.000028,0.000002,0.000158,0.000302,0.000998,0.003448,0.007770,0.008911,...,0.009113,0.008373,0.009585,0.010007,0.008427,0.007845,0.009887,0.006584,0.007445,0.002440
2,0.001256,0.000577,0.000052,0.000030,0.000086,0.000308,0.000879,0.003441,0.005561,0.008328,...,0.007181,0.006474,0.006290,0.006895,0.005045,0.004978,0.005282,0.003231,0.002469,0.001343
3,0.002337,0.001663,0.000779,0.000445,0.000220,0.000678,0.002451,0.011564,0.017392,0.020199,...,0.018495,0.023814,0.023962,0.022650,0.019704,0.017173,0.021170,0.016779,0.012174,0.008751
4,0.002504,0.001642,0.000703,0.000390,0.000237,0.000710,0.002671,0.012310,0.018621,0.021637,...,0.017875,0.022289,0.022932,0.023358,0.020317,0.018138,0.020992,0.017747,0.012685,0.009127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.000422,0.000045,0.000043,0.000000,0.000022,0.000022,0.000282,0.001565,0.002740,0.003328,...,0.003405,0.003055,0.003520,0.004081,0.003484,0.002963,0.003333,0.002672,0.002220,0.000411
237,0.000177,0.000147,0.000000,0.000059,0.000000,0.000000,0.000066,0.000360,0.001415,0.001843,...,0.000576,0.001313,0.001666,0.000958,0.001260,0.002317,0.001349,0.000735,0.001961,0.001295
238,0.000163,0.000061,0.000000,0.000054,0.000000,0.000000,0.000102,0.000356,0.001002,0.001704,...,0.000701,0.001297,0.001293,0.000777,0.000702,0.001281,0.001225,0.000729,0.001581,0.001167
239,0.000009,0.000000,0.000000,0.000003,0.000000,0.000000,0.000041,0.000024,0.000228,0.000186,...,0.000177,0.000172,0.000203,0.000070,0.000083,0.000152,0.000072,0.000136,0.000150,0.000142


In [7]:
# Initialize StratifiedKFold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
# Initialize kNN classifier
knn = KNeighborsClassifier(n_neighbors=10, metric=cid_distance)
knn2 = KNeighborsClassifier(n_neighbors=10)

#Init svm and random forests
rf = RandomForestClassifier(n_estimators=200, random_state=0)
rf2 = RandomForestClassifier(n_estimators=250, random_state=0)
rf3 = RandomForestClassifier(n_estimators=500, random_state=0)
svm = RandomForestClassifier(n_estimators=750, random_state=0)
svm2 = RandomForestClassifier(n_estimators=100, random_state=0)
#svm = SVC(C=0.5)
#svm2 = SVC(C=0.1)

In [8]:
# Perform Stratified K-Fold cross-validation
fold_accuracies_knn = []
fold_accuracies_rf = []
fold_accuracies_rf2 = []
fold_accuracies_rf3 = []
fold_accuracies_svm = []
fold_accuracies_svm2 = []
predictions = np.zeros(len(y))  #Store the predictions

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifiers
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rf2.fit(X_train, y_train)
    rf3.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    svm2.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = knn.predict(X_test)
    y_pred2 = rf.predict(X_test)
    y_pred22 = rf2.predict(X_test)
    y_pred222 = rf3.predict(X_test)
    y_pred3 = svm.predict(X_test)
    y_pred4 = svm2.predict(X_test)
    
    # Evaluate the models performances
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies_knn.append(accuracy)
    
    accuracy2 = accuracy_score(y_test, y_pred2)
    fold_accuracies_rf.append(accuracy2)

    accuracy22 = accuracy_score(y_test, y_pred22)
    fold_accuracies_rf2.append(accuracy22)

    accuracy222 = accuracy_score(y_test, y_pred222)
    fold_accuracies_rf3.append(accuracy222)

    accuracy3 = accuracy_score(y_test, y_pred3)
    fold_accuracies_svm.append(accuracy3)

    accuracy4 = accuracy_score(y_test, y_pred4)
    fold_accuracies_svm2.append(accuracy4)

    # Store predictions in the correct positions
    predictions[test_index] = y_pred
    
    #print(f"Fold {fold + 1}:")
    #print(f"Train indices: {train_index}, Test indices: {test_index}")
    #print(f"Accuracy: {accuracy:.4f}")
    #print("-" * 30)

# Summary of results
print(f"KNN - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_knn):.4f} ± {np.std(fold_accuracies_knn):.4f}")
print(f"RF - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf):.4f} ± {np.std(fold_accuracies_rf):.4f}")
print(f"RF2 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf2):.4f} ± {np.std(fold_accuracies_rf2):.4f}")
print(f"RF3 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf3):.4f} ± {np.std(fold_accuracies_rf3):.4f}")

print(f"SVM ovo - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm):.4f} ± {np.std(fold_accuracies_svm):.4f}")
print(f"SVM linear - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm2):.4f} ± {np.std(fold_accuracies_svm2):.4f}")

KNN - Mean accuracy + std over 10 folds: 0.5932 ± 0.0749
RF - Mean accuracy + std over 10 folds: 0.5933 ± 0.0740
RF2 - Mean accuracy + std over 10 folds: 0.6100 ± 0.0722
RF3 - Mean accuracy + std over 10 folds: 0.6058 ± 0.0771
SVM ovo - Mean accuracy + std over 10 folds: 0.5808 ± 0.0865
SVM linear - Mean accuracy + std over 10 folds: 0.5850 ± 0.0970


In [9]:
#np.savetxt("../ttest/knn_ci_smsin.csv", fold_accuracies_knn, delimiter=",", fmt='%.6f')
np.savetxt("../ttest/rf_200_smsin.csv", fold_accuracies_rf, delimiter=",", fmt='%.6f')
np.savetxt("../ttest/rf_500_smsin.csv", fold_accuracies_rf3, delimiter=",", fmt='%.6f')

In [10]:
scores1 = cross_val_score(knn, x, y, cv=5)
scores2 = cross_val_score(knn, x, y, cv=10)

In [11]:
#print(scores1.mean(dtype=np.float64))
#print(scores2)
print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))
print("10 fold: %f accuracy with a standard deviation of %f" % (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64)))
print(scores2)

5 fold: 0.50 accuracy with a standard deviation of 0.07
10 fold: 0.523500 accuracy with a standard deviation of 0.173360
[0.36       0.33333333 0.41666667 0.75       0.375      0.54166667
 0.375      0.70833333 0.54166667 0.83333333]


In [12]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [13]:
data2 = pd.read_csv("../csv/19x19/classif_18x18_smsout.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_smsout.csv")

data2.head()
#20x20 grid, monday(4/11) - smsout / friday(8/11)

Unnamed: 0.1,Unnamed: 0,cellid,smsout0,smsout1,smsout2,smsout3,smsout4,smsout5,smsout6,smsout7,...,smsout15,smsout16,smsout17,smsout18,smsout19,smsout20,smsout21,smsout22,smsout23,WHF
0,0,4379,0.000506,0.000249,1e-05,2e-06,4.8e-05,5e-06,0.000482,0.001283,...,0.003694,0.002877,0.002676,0.003814,0.002715,0.003396,0.003801,0.002788,0.001381,1
1,1,4380,0.001622,0.001365,0.010339,0.011154,0.010858,0.011243,0.00889,0.002497,...,0.004583,0.005636,0.007947,0.013844,0.006955,0.005852,0.005189,0.005335,0.002228,2
2,2,4381,0.001115,0.00037,0.000138,0.0,8e-06,8e-06,0.000418,0.002093,...,0.003074,0.002176,0.002187,0.003226,0.002198,0.003144,0.001724,0.00197,0.001457,1
3,3,4382,0.00187,0.001042,0.001751,0.0,0.000118,0.000118,0.001525,0.011887,...,0.017044,0.013721,0.014986,0.016102,0.012975,0.016939,0.014571,0.012702,0.00993,1
4,4,4383,0.002081,0.001223,0.001724,1.9e-05,0.000104,0.000402,0.001724,0.013061,...,0.017347,0.015103,0.015889,0.017413,0.014364,0.01791,0.015497,0.013267,0.010571,1


In [14]:
y = data2.iloc[:,26] #h/w
x = data2.iloc[:,2:26] #sms, calls, internet

In [15]:
# Initialize StratifiedKFold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

scores1 = cross_val_score(knn, x, y, cv=5)
scores2 = cross_val_score(knn, x, y, cv=10)

In [16]:
# Perform Stratified K-Fold cross-validation
fold_accuracies_knn = []
fold_accuracies_rf = []
fold_accuracies_rf2 = []
fold_accuracies_rf3 = []
fold_accuracies_svm = []
fold_accuracies_svm2 = []
#predictions = np.zeros(len(y))  #Store the predictions
#predictions2 = np.zeros(len(y))  #Store the predictions
#predictions3 = np.zeros(len(y))  #Store the predictions

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifiers
    knn.fit(X_train, y_train)
    #knn2.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rf2.fit(X_train, y_train)
    rf3.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    svm2.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = knn.predict(X_test)
    #y_pred2 = knn.predict(X_test)
    y_pred2 = rf.predict(X_test)
    y_pred22 = rf2.predict(X_test)
    y_pred222 = rf3.predict(X_test)
    y_pred3 = svm.predict(X_test)
    y_pred4 = svm2.predict(X_test)
    
    # Evaluate the models performances
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies_knn.append(accuracy)
    
    accuracy2 = accuracy_score(y_test, y_pred2)
    fold_accuracies_rf.append(accuracy2)

    accuracy22 = accuracy_score(y_test, y_pred22)
    fold_accuracies_rf2.append(accuracy22)

    accuracy222 = accuracy_score(y_test, y_pred222)
    fold_accuracies_rf3.append(accuracy222)

    accuracy3 = accuracy_score(y_test, y_pred3)
    fold_accuracies_svm.append(accuracy3)

    accuracy4 = accuracy_score(y_test, y_pred4)
    fold_accuracies_svm2.append(accuracy4)

    # Store predictions in the correct positions
    #predictions[test_index] = y_pred
    #predictions2[test_index] = y_pred2
    #predictions3[test_index] = y_pred4
    
    #print(f"Fold {fold + 1}:")
    #print(f"Train indices: {train_index}, Test indices: {test_index}")
    #print(f"Accuracy: {accuracy:.4f}")
    #print("-" * 30)

# Summary of results
print(f"KNN - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_knn):.4f} ± {np.std(fold_accuracies_knn):.4f}")
print(f"RF - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf):.4f} ± {np.std(fold_accuracies_rf):.4f}")
print(f"RF2 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf2):.4f} ± {np.std(fold_accuracies_rf2):.4f}")
print(f"RF3 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf3):.4f} ± {np.std(fold_accuracies_rf3):.4f}")

print(f"SVM ovo - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm):.4f} ± {np.std(fold_accuracies_svm):.4f}")
print(f"SVM linear - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm2):.4f} ± {np.std(fold_accuracies_svm2):.4f}")

KNN - Mean accuracy + std over 10 folds: 0.5763 ± 0.0660
RF - Mean accuracy + std over 10 folds: 0.5725 ± 0.0751
RF2 - Mean accuracy + std over 10 folds: 0.5807 ± 0.0743
RF3 - Mean accuracy + std over 10 folds: 0.5765 ± 0.0797
SVM ovo - Mean accuracy + std over 10 folds: 0.5808 ± 0.0803
SVM linear - Mean accuracy + std over 10 folds: 0.5477 ± 0.0779


In [17]:
#np.savetxt("../ttest/knn_ci_smsout.csv", fold_accuracies_knn, delimiter=",", fmt='%.6f')
#np.savetxt("../ttest/knn_manh_smsout.csv", fold_accuracies_rf, delimiter=",", fmt='%.6f')

In [18]:
print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))
print("10 fold: %f accuracy with a standard deviation of %f" % (scores2.mean(), scores2.std()))
print(scores2)

5 fold: 0.50 accuracy with a standard deviation of 0.10
10 fold: 0.527333 accuracy with a standard deviation of 0.126223
[0.44       0.375      0.375      0.54166667 0.5        0.45833333
 0.54166667 0.75       0.54166667 0.75      ]


In [19]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [20]:
data = pd.read_csv("../csv/19x19/classif_18x18_callin.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_callin.csv")

data.head()
#20x20 grid, monday(4/11) - callin / friday(8/11)

Unnamed: 0.1,Unnamed: 0,cellid,callin0,callin1,callin2,callin3,callin4,callin5,callin6,callin7,...,callin15,callin16,callin17,callin18,callin19,callin20,callin21,callin22,callin23,WHF
0,0,4379,0.000204,0.000716,0.000118,0.0,0.0,0.000324,0.000149,0.002093,...,0.00915,0.013073,0.010728,0.009591,0.008018,0.004955,0.003183,0.001443,0.000482,1
1,1,4380,0.001311,0.000462,0.000277,0.0,0.000101,0.000388,0.000334,0.004638,...,0.014708,0.017623,0.016723,0.013522,0.013662,0.008128,0.004873,0.00261,0.001061,2
2,2,4381,0.000364,0.000249,0.0,0.0,0.0,0.000218,0.000499,0.004747,...,0.010988,0.011554,0.011801,0.008018,0.007783,0.00472,0.002418,0.000904,0.000619,1
3,3,4382,0.001501,0.000915,0.0,0.0,0.0,0.001501,0.002771,0.010943,...,0.031121,0.046033,0.046125,0.04856,0.024682,0.020695,0.009119,0.004223,0.004496,1
4,4,4383,0.001647,0.000863,1.4e-05,0.0,3.5e-05,0.00155,0.002651,0.011641,...,0.033412,0.046762,0.046555,0.051328,0.025946,0.021297,0.010219,0.004508,0.005045,1


In [21]:
y = data.iloc[:,26] #h/w
x = data.iloc[:,2:26] #sms, calls, internet

In [22]:
# Initialize StratifiedKFold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

scores1 = cross_val_score(knn, x, y, cv=5)
scores2 = cross_val_score(knn, x, y, cv=10)

In [23]:
# Perform Stratified K-Fold cross-validation
fold_accuracies_knn = []
fold_accuracies_rf = []
fold_accuracies_rf2 = []
fold_accuracies_rf3 = []
fold_accuracies_svm = []
fold_accuracies_svm2 = []
#predictions2 = np.zeros(len(y))  #Store the predictions
#predictions3 = np.zeros(len(y)) 

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifiers
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rf2.fit(X_train, y_train)
    rf3.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    svm2.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = knn.predict(X_test)
    y_pred2 = rf.predict(X_test)
    y_pred22 = rf2.predict(X_test)
    y_pred222 = rf3.predict(X_test)
    y_pred3 = svm.predict(X_test)
    y_pred4 = svm2.predict(X_test)
    
    # Evaluate the models performances
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies_knn.append(accuracy)
    
    accuracy2 = accuracy_score(y_test, y_pred2)
    fold_accuracies_rf.append(accuracy2)

    accuracy22 = accuracy_score(y_test, y_pred22)
    fold_accuracies_rf2.append(accuracy22)

    accuracy222 = accuracy_score(y_test, y_pred222)
    fold_accuracies_rf3.append(accuracy222)

    accuracy3 = accuracy_score(y_test, y_pred3)
    fold_accuracies_svm.append(accuracy3)

    accuracy4 = accuracy_score(y_test, y_pred4)
    fold_accuracies_svm2.append(accuracy4)

    # Store predictions in the correct positions
    #predictions2[test_index] = y_pred2
    #predictions3[test_index] = y_pred22
    
    #print(f"Fold {fold + 1}:")
    #print(f"Train indices: {train_index}, Test indices: {test_index}")
    #print(f"Accuracy: {accuracy:.4f}")
    #print("-" * 30)

# Summary of results
print(f"KNN - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_knn):.4f} ± {np.std(fold_accuracies_knn):.4f}")
print(f"RF - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf):.4f} ± {np.std(fold_accuracies_rf):.4f}")
print(f"RF2 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf2):.4f} ± {np.std(fold_accuracies_rf2):.4f}")
print(f"RF3 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf3):.4f} ± {np.std(fold_accuracies_rf3):.4f}")

print(f"SVM ovo - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm):.4f} ± {np.std(fold_accuracies_svm):.4f}")
print(f"SVM linear - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm2):.4f} ± {np.std(fold_accuracies_svm2):.4f}")

KNN - Mean accuracy + std over 10 folds: 0.5140 ± 0.0699
RF - Mean accuracy + std over 10 folds: 0.6263 ± 0.1076
RF2 - Mean accuracy + std over 10 folds: 0.6388 ± 0.1107
RF3 - Mean accuracy + std over 10 folds: 0.6470 ± 0.1098
SVM ovo - Mean accuracy + std over 10 folds: 0.6345 ± 0.1076
SVM linear - Mean accuracy + std over 10 folds: 0.6098 ± 0.1184


In [24]:
#np.savetxt("../ttest/svm_callin.csv", fold_accuracies_svm2, delimiter=",", fmt='%.6f')
np.savetxt("../ttest/rf_500_callin.csv", fold_accuracies_rf3, delimiter=",", fmt='%.6f')

In [25]:
print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))
print("10 fold: %f accuracy with a standard deviation of %f" % (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64)))
print(scores2)

5 fold: 0.50 accuracy with a standard deviation of 0.11
10 fold: 0.506333 accuracy with a standard deviation of 0.126381
[0.48       0.375      0.5        0.45833333 0.33333333 0.41666667
 0.5        0.54166667 0.70833333 0.75      ]


In [26]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [27]:
data = pd.read_csv("../csv/19x19/classif_18x18_callout.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_callout.csv")

data.head()
#20x20 grid, monday(4/11) - callout / friday(8/11)

Unnamed: 0.1,Unnamed: 0,cellid,callout0,callout1,callout2,callout3,callout4,callout5,callout6,callout7,...,callout15,callout16,callout17,callout18,callout19,callout20,callout21,callout22,callout23,WHF
0,0,4379,1e-05,9.7e-05,0.0,0.0,5e-06,0.000152,0.000584,0.002842,...,0.009999,0.009711,0.011318,0.008949,0.005031,0.005754,0.002906,0.001478,0.0006,1
1,1,4380,0.000157,5.7e-05,0.0,9e-06,0.0,0.000237,0.000978,0.006365,...,0.017284,0.016388,0.019477,0.015358,0.011301,0.008685,0.005553,0.002499,0.000675,2
2,2,4381,0.000294,4e-05,2.3e-05,1.7e-05,0.0,0.000282,0.000605,0.006603,...,0.010329,0.010004,0.012635,0.010447,0.005459,0.004089,0.003268,0.001387,0.000201,1
3,3,4382,0.000525,0.000497,0.000245,0.000252,0.0,0.000538,0.001771,0.013803,...,0.034587,0.036961,0.052016,0.038642,0.024501,0.022786,0.009294,0.00922,0.002008,1
4,4,4383,0.000594,0.000537,0.000273,0.000308,0.0,0.000442,0.00183,0.014267,...,0.036211,0.037631,0.053166,0.038921,0.02418,0.022625,0.010503,0.009822,0.0021,1


In [28]:
y = data.iloc[:,26] #h/w
x = data.iloc[:,2:26] #sms, calls, internet

In [29]:
# Initialize StratifiedKFold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

scores1 = cross_val_score(knn, x, y, cv=5)
scores2 = cross_val_score(knn, x, y, cv=10)

In [30]:
# Perform Stratified K-Fold cross-validation
fold_accuracies_knn = []
fold_accuracies_rf = []
fold_accuracies_rf2 = []
fold_accuracies_rf3 = []
fold_accuracies_svm = []
fold_accuracies_svm2 = []
#predictions5 = np.zeros(len(y))  #Store the predictions
#predictions = np.zeros(len(y))

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifiers
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rf2.fit(X_train, y_train)
    rf3.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    svm2.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = knn.predict(X_test)
    y_pred2 = rf.predict(X_test)
    y_pred22 = rf2.predict(X_test)
    y_pred222 = rf3.predict(X_test)
    y_pred3 = svm.predict(X_test)
    y_pred4 = svm2.predict(X_test)
    
    # Evaluate the models performances
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies_knn.append(accuracy)
    
    accuracy2 = accuracy_score(y_test, y_pred2)
    fold_accuracies_rf.append(accuracy2)

    accuracy22 = accuracy_score(y_test, y_pred22)
    fold_accuracies_rf2.append(accuracy22)

    accuracy222 = accuracy_score(y_test, y_pred222)
    fold_accuracies_rf3.append(accuracy222)

    accuracy3 = accuracy_score(y_test, y_pred3)
    fold_accuracies_svm.append(accuracy3)

    accuracy4 = accuracy_score(y_test, y_pred4)
    fold_accuracies_svm2.append(accuracy4)

    # Store predictions in the correct positions
    #predictions[test_index] = y_pred2
    
    #print(f"Fold {fold + 1}:")
    #print(f"Train indices: {train_index}, Test indices: {test_index}")
    #print(f"Accuracy: {accuracy:.4f}")
    #print("-" * 30)

# Summary of results
print(f"KNN - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_knn):.4f} ± {np.std(fold_accuracies_knn):.4f}")
print(f"RF - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf):.4f} ± {np.std(fold_accuracies_rf):.4f}")
print(f"RF2 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf2):.4f} ± {np.std(fold_accuracies_rf2):.4f}")
print(f"RF3 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf3):.4f} ± {np.std(fold_accuracies_rf3):.4f}")

print(f"SVM ovo - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm):.4f} ± {np.std(fold_accuracies_svm):.4f}")
print(f"SVM linear - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm2):.4f} ± {np.std(fold_accuracies_svm2):.4f}")

KNN - Mean accuracy + std over 10 folds: 0.5187 ± 0.0815
RF - Mean accuracy + std over 10 folds: 0.6057 ± 0.0977
RF2 - Mean accuracy + std over 10 folds: 0.6057 ± 0.0801
RF3 - Mean accuracy + std over 10 folds: 0.5765 ± 0.1042
SVM ovo - Mean accuracy + std over 10 folds: 0.5973 ± 0.1010
SVM linear - Mean accuracy + std over 10 folds: 0.5807 ± 0.0810


In [31]:
np.savetxt("../ttest/rf_750_callout.csv", fold_accuracies_svm, delimiter=",", fmt='%.6f')

In [32]:
print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))
print("10 fold: %f accuracy with a standard deviation of %f" % (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64)))
print(scores2)

5 fold: 0.47 accuracy with a standard deviation of 0.12
10 fold: 0.465000 accuracy with a standard deviation of 0.137376
[0.4        0.29166667 0.58333333 0.54166667 0.20833333 0.33333333
 0.54166667 0.54166667 0.58333333 0.625     ]


In [33]:
y_predt = cross_val_predict(knn, x, y, cv=10) #10

In [34]:
data = pd.read_csv("../csv/19x19/classif_18x18_internet.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_internet.csv")

data.head()
#20x20 grid, monday(4/11) - internet / friday(8/11)

Unnamed: 0.1,Unnamed: 0,cellid,internet0,internet1,internet2,internet3,internet4,internet5,internet6,internet7,...,internet15,internet16,internet17,internet18,internet19,internet20,internet21,internet22,internet23,WHF
0,0,4379,0.002708,0.002231,0.002118,0.001976,0.002595,0.002364,0.004048,0.006266,...,0.006862,0.007546,0.00738,0.00802,0.006335,0.0063,0.005029,0.003849,0.003095,1
1,1,4380,0.004681,0.004419,0.00392,0.00356,0.003632,0.00389,0.005723,0.009949,...,0.010755,0.011996,0.012473,0.012097,0.011768,0.011386,0.009635,0.007705,0.006533,2
2,2,4381,0.002855,0.001887,0.001647,0.0016,0.001554,0.001969,0.002623,0.004532,...,0.005013,0.006633,0.006494,0.005845,0.005588,0.005884,0.005515,0.003641,0.003086,1
3,3,4382,0.01848,0.011011,0.009332,0.010083,0.010106,0.011391,0.015604,0.022403,...,0.023052,0.024342,0.027015,0.028007,0.025216,0.02569,0.026191,0.019743,0.0161,1
4,4,4383,0.0198,0.011374,0.009686,0.010369,0.010413,0.01155,0.015817,0.023754,...,0.025364,0.026221,0.02953,0.029931,0.026917,0.027519,0.027992,0.021135,0.017029,1


In [35]:
y = data.iloc[:,26] #h/w
x = data.iloc[:,2:26] #sms, calls, internet

In [36]:
# Initialize StratifiedKFold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

scores1 = cross_val_score(knn, x, y, cv=5)
scores2 = cross_val_score(knn, x, y, cv=10)

In [37]:
# Perform Stratified K-Fold cross-validation
fold_accuracies_knn = []
fold_accuracies_rf = []
fold_accuracies_rf2 = []
fold_accuracies_rf3 = []
fold_accuracies_svm = []
fold_accuracies_svm2 = []
#predictions = np.zeros(len(y))  #Store the predictions

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifiers
    knn.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    rf2.fit(X_train, y_train)
    rf3.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    svm2.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = knn.predict(X_test)
    y_pred2 = rf.predict(X_test)
    y_pred22 = rf2.predict(X_test)
    y_pred222 = rf3.predict(X_test)
    y_pred3 = svm.predict(X_test)
    y_pred4 = svm2.predict(X_test)
    
    # Evaluate the models performances
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies_knn.append(accuracy)
    
    accuracy2 = accuracy_score(y_test, y_pred2)
    fold_accuracies_rf.append(accuracy2)

    accuracy22 = accuracy_score(y_test, y_pred22)
    fold_accuracies_rf2.append(accuracy22)

    accuracy222 = accuracy_score(y_test, y_pred222)
    fold_accuracies_rf3.append(accuracy222)

    accuracy3 = accuracy_score(y_test, y_pred3)
    fold_accuracies_svm.append(accuracy3)

    accuracy4 = accuracy_score(y_test, y_pred4)
    fold_accuracies_svm2.append(accuracy4)

    # Store predictions in the correct positions
    #predictions[test_index] = y_pred2
    
    #print(f"Fold {fold + 1}:")
    #print(f"Train indices: {train_index}, Test indices: {test_index}")
    #print(f"Accuracy: {accuracy:.4f}")
    #print("-" * 30)

# Summary of results
print(f"KNN - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_knn):.4f} ± {np.std(fold_accuracies_knn):.4f}")
print(f"RF - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf):.4f} ± {np.std(fold_accuracies_rf):.4f}")
print(f"RF2 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf2):.4f} ± {np.std(fold_accuracies_rf2):.4f}")
print(f"RF3 - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_rf3):.4f} ± {np.std(fold_accuracies_rf3):.4f}")

print(f"SVM ovo - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm):.4f} ± {np.std(fold_accuracies_svm):.4f}")
print(f"SVM linear - Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies_svm2):.4f} ± {np.std(fold_accuracies_svm2):.4f}")

KNN - Mean accuracy + std over 10 folds: 0.5603 ± 0.0804
RF - Mean accuracy + std over 10 folds: 0.5555 ± 0.1070
RF2 - Mean accuracy + std over 10 folds: 0.5557 ± 0.1016
RF3 - Mean accuracy + std over 10 folds: 0.5682 ± 0.1050
SVM ovo - Mean accuracy + std over 10 folds: 0.5640 ± 0.1035
SVM linear - Mean accuracy + std over 10 folds: 0.5678 ± 0.1007


In [38]:
#np.savetxt("../ttest/svm_internet.csv", fold_accuracies_svm2, delimiter=",", fmt='%.6f')
#np.savetxt("../ttest/knn_ci_internet.csv", fold_accuracies_knn, delimiter=",", fmt='%.6f')
#np.savetxt("../ttest/knn_manh_internet.csv", fold_accuracies_rf, delimiter=",", fmt='%.6f')

In [39]:
print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))
print("10 fold: %f accuracy with a standard deviation of %f" % (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64)))
print(scores2)

5 fold: 0.46 accuracy with a standard deviation of 0.10
10 fold: 0.490000 accuracy with a standard deviation of 0.109595
[0.4        0.5        0.33333333 0.45833333 0.45833333 0.45833333
 0.41666667 0.54166667 0.58333333 0.75      ]


In [40]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [41]:
predicted = pd.DataFrame(np.transpose(predictions).astype(int),columns=['predicted'])
predicted['cellid']=data2.cellid
predicted.head() #callin rf50

Unnamed: 0,predicted,cellid
0,1,4379
1,2,4380
2,1,4381
3,1,4382
4,1,4383


In [42]:
#predicted2 = pd.DataFrame(np.transpose(predictions2).astype(int),columns=['predicted'])
#predicted2['cellid']=data2.cellid
#predicted2.head()

In [43]:
#predicted3 = pd.DataFrame(np.transpose(predictions3).astype(int),columns=['predicted'])
#predicted3['cellid']=data2.cellid
#predicted3.head() #callout rf50

In [44]:
#predicted4 = pd.DataFrame(np.transpose(predictions4).astype(int),columns=['predicted'])
#predicted4['cellid']=data2.cellid
#predicted4.head()

In [45]:
#predicted5 = pd.DataFrame(np.transpose(predictions5).astype(int),columns=['predicted'])
#predicted5['cellid']=data2.cellid
#predicted5.head()

In [46]:
#predicted.to_csv('../csv/predicted_smsin_knn10cid_19x19.csv')

In [47]:
#predicted2.to_csv('../csv/predicted_smsout_knn10manh_19x19.csv')

In [48]:
#predicted3.to_csv('../csv/predicted_callout_rf50_19x19.csv')

In [49]:
#predicted4.to_csv('../csv/predicted_callin_rf200_18x18_new2.csv')

In [50]:
#predicted5.to_csv('../csv/predicted_callout_rf50_18x18_new2.csv')