In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.model_selection import train_test_split
#from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn_extra.cluster import KMedoids
from scipy.spatial.distance import euclidean

In [2]:
def complexity(ts):
    """Return the complexity estimate of a 1-D time series.

    The estimate is the square root of the sum of squared first differences,
    i.e. the length of the series when "stretched out" into a line.

    Parameters
    ----------
    ts : array-like
        One-dimensional sequence of numeric samples.

    Returns
    -------
    float
        Non-negative complexity estimate; 0 for a constant series.
    """
    # Cast to float so integer (especially unsigned) input cannot wrap in diff.
    ts = np.asarray(ts, dtype=float)
    return np.sqrt(np.sum(np.diff(ts)**2))


# CID Distance function
def cid_distance(ts1, ts2):
    """Complexity-invariant distance (CID) between two equal-length series.

    CID(a, b) = ED(a, b) * max(CE(a), CE(b)) / min(CE(a), CE(b))

    where ED is the Euclidean distance and CE is ``complexity``. The
    correction factor is always >= 1 and the result is symmetric in its
    arguments.

    Parameters
    ----------
    ts1, ts2 : array-like
        One-dimensional numeric sequences of the same length.

    Returns
    -------
    float
        The CID value. If both series have zero complexity the plain
        Euclidean distance is returned; if exactly one has zero complexity
        the result is ``inf``.
    """
    # Ensure both inputs are float numpy arrays
    ts1 = np.asarray(ts1, dtype=float)
    ts2 = np.asarray(ts2, dtype=float)

    # Step 1: standard Euclidean distance
    dist = euclidean(ts1, ts2)

    # Step 2: complexity estimates of both series
    c_ts1 = complexity(ts1)
    c_ts2 = complexity(ts2)
    c_high = max(c_ts1, c_ts2)
    c_low = min(c_ts1, c_ts2)

    # Step 3: guard the division. Two constant series are equally complex,
    # so no correction applies; a constant vs. a varying series is treated
    # as maximally different in complexity.
    if c_low == 0:
        return dist if c_high == 0 else np.inf

    # Step 4: apply the symmetric CID correction factor (>= 1)
    return dist * (c_high / c_low)

In [3]:
# Load the 15x15-grid Monday (4/11) smsin table.
# Swap in the commented-out line to use the 20x20 Friday (8/11) file instead.
data = pd.read_csv("../csv/15x15/classif_15x15_smsin.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_smsin.csv")

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,cellid,smsin0,smsin1,smsin2,smsin3,smsin4,smsin5,smsin6,smsin7,...,smsin15,smsin16,smsin17,smsin18,smsin19,smsin20,smsin21,smsin22,smsin23,WHF
0,0,4379,0.000378,0.000199,0.0,9e-06,0.000158,0.000204,0.000482,0.001992,...,0.004781,0.005944,0.00533,0.00547,0.0039,0.005816,0.003817,0.004231,0.000898,1
1,1,4380,0.001471,0.000311,2.8e-05,2e-06,0.000158,0.000302,0.000998,0.003448,...,0.008373,0.009585,0.010007,0.008427,0.007845,0.009887,0.006584,0.007445,0.00244,0
2,2,4381,0.001256,0.000577,5.2e-05,3e-05,8.6e-05,0.000308,0.000879,0.003441,...,0.006474,0.00629,0.006895,0.005045,0.004978,0.005282,0.003231,0.002469,0.001343,1
3,3,4382,0.002337,0.001663,0.000779,0.000445,0.00022,0.000678,0.002451,0.011564,...,0.023814,0.023962,0.02265,0.019704,0.017173,0.02117,0.016779,0.012174,0.008751,1
4,4,4383,0.002504,0.001642,0.000703,0.00039,0.000237,0.00071,0.002671,0.01231,...,0.022289,0.022932,0.023358,0.020317,0.018138,0.020992,0.017747,0.012685,0.009127,1


In [4]:
# Features: the 24 hourly smsin columns at positions 2..25 (smsin0 .. smsin23).
x = data.iloc[:, 2:26]
# Target: the class label at position 26 (the WHF column; original note: h/w).
y = data.iloc[:, 26]

In [5]:
# Display the label Series selected in the previous cell.
y

0      1
1      0
2      1
3      1
4      1
      ..
220    0
221    1
222    1
223    1
224    2
Name: WHF, Length: 225, dtype: int64

In [6]:
# Display the feature frame selected above.
x

Unnamed: 0,smsin0,smsin1,smsin2,smsin3,smsin4,smsin5,smsin6,smsin7,smsin8,smsin9,...,smsin14,smsin15,smsin16,smsin17,smsin18,smsin19,smsin20,smsin21,smsin22,smsin23
0,0.000378,0.000199,0.000000,8.946567e-06,1.580152e-04,0.000204,0.000482,0.001992,0.004211,0.005459,...,0.004141,0.004781,0.005944,0.005330,0.005470,0.003900,0.005816,0.003817,0.004231,0.000898
1,0.001471,0.000311,0.000028,1.616931e-06,1.583869e-04,0.000302,0.000998,0.003448,0.007770,0.008911,...,0.009113,0.008373,0.009585,0.010007,0.008427,0.007845,0.009887,0.006584,0.007445,0.002440
2,0.001256,0.000577,0.000052,2.980867e-05,8.597758e-05,0.000308,0.000879,0.003441,0.005561,0.008328,...,0.007181,0.006474,0.006290,0.006895,0.005045,0.004978,0.005282,0.003231,0.002469,0.001343
3,0.002337,0.001663,0.000779,4.450882e-04,2.196266e-04,0.000678,0.002451,0.011564,0.017392,0.020199,...,0.018495,0.023814,0.023962,0.022650,0.019704,0.017173,0.021170,0.016779,0.012174,0.008751
4,0.002504,0.001642,0.000703,3.902351e-04,2.372259e-04,0.000710,0.002671,0.012310,0.018621,0.021637,...,0.017875,0.022289,0.022932,0.023358,0.020317,0.018138,0.020992,0.017747,0.012685,0.009127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,0.001523,0.001134,0.000029,1.578763e-04,1.851365e-04,0.000016,0.000537,0.001786,0.005320,0.008963,...,0.005071,0.005746,0.004949,0.004794,0.005304,0.009425,0.006844,0.005351,0.001916,0.000838
221,0.000927,0.000360,0.000429,4.850104e-05,4.446094e-04,0.000041,0.001022,0.002801,0.004000,0.005747,...,0.004785,0.004672,0.005891,0.006787,0.003789,0.009461,0.008002,0.002404,0.003204,0.001205
222,0.000974,0.000106,0.000392,0.000000e+00,3.424626e-04,0.000052,0.001261,0.003754,0.004570,0.005064,...,0.005572,0.005164,0.005633,0.007200,0.004206,0.008905,0.007998,0.003387,0.003485,0.001368
223,0.000966,0.000214,0.000078,1.863883e-07,1.863883e-07,0.000038,0.000764,0.001975,0.003723,0.003381,...,0.003307,0.003634,0.003777,0.003685,0.003337,0.004258,0.004236,0.004074,0.002345,0.000967


In [7]:
# kNN classifier: 10 neighbours, Euclidean metric.
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=10)

# Shuffled, seeded stratified splitter used by the manual fold loop.
n_splits = 10
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [8]:
# Manual stratified k-fold evaluation of `knn` on (x, y) using `skf`.
fold_accuracies = []

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Carve out this fold's training rows and held-out rows.
    X_train, y_train = x.iloc[train_index], y.iloc[train_index]
    X_test, y_test = x.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Score the held-out rows and remember the result.
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # One newline-separated print emits the same four report lines.
    print(
        f"Fold {fold + 1}:",
        f"Train indices: {train_index}, Test indices: {test_index}",
        f"Accuracy: {accuracy:.4f}",
        "-" * 30,
        sep="\n",
    )

# Aggregate over all folds.
print(f"Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")

Fold 1:
Train indices: [  0   1   4   5   6   7   8   9  11  12  14  15  16  17  18  19  20  21
  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56  57  58
  60  61  62  63  64  65  67  68  69  70  71  72  73  74  75  77  78  79
  80  81  82  83  84  85  86  87  88  89  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 109 110 111 112 113 115 116 117 118
 120 121 123 124 125 126 127 128 129 130 131 132 133 134 136 137 138 139
 140 141 142 143 144 145 146 147 148 150 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 178
 179 180 181 183 184 185 187 188 189 192 193 194 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 217 218 219 220
 221 222 223 224], Test indices: [  2   3  10  13  52  59  66  76  90 108 114 119 122 135 149 151 177 182
 186 190 191 195 216]
Accuracy: 0.4348
-----------------------------

In [9]:
# Per-fold accuracies from cross_val_score for 5 and then 10 folds.
# An integer `cv` is passed here, not the `skf` splitter used by the manual loop.
scores1, scores2 = (cross_val_score(knn, x, y, cv=n_folds) for n_folds in (5, 10))

In [10]:
# Summarise both cross_val_score runs as mean and standard deviation,
# then list the raw 10-fold accuracies.
#print(scores1.mean(dtype=np.float64))
#print(scores2)
five_fold_stats = (scores1.mean(), scores1.std())
ten_fold_stats = (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64))

print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % five_fold_stats)
print("10 fold: %f accuracy with a standard deviation of %f" % ten_fold_stats)
print(scores2)

5 fold: 0.49 accuracy with a standard deviation of 0.04
10 fold: 0.479447 accuracy with a standard deviation of 0.063063
[0.56521739 0.52173913 0.39130435 0.47826087 0.56521739 0.40909091
 0.45454545 0.40909091 0.54545455 0.45454545]


In [11]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [12]:
# Load the 15x15-grid Monday (4/11) smsout table; this rebinds `data`.
# Swap in the commented-out line to use the 20x20 Friday (8/11) file instead.
data = pd.read_csv("../csv/15x15/classif_15x15_smsout.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_smsout.csv")

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,cellid,smsout0,smsout1,smsout2,smsout3,smsout4,smsout5,smsout6,smsout7,...,smsout15,smsout16,smsout17,smsout18,smsout19,smsout20,smsout21,smsout22,smsout23,WHF
0,0,4379,0.000506,0.000249,1e-05,2e-06,4.8e-05,5e-06,0.000482,0.001283,...,0.003694,0.002877,0.002676,0.003814,0.002715,0.003396,0.003801,0.002788,0.001381,1
1,1,4380,0.001622,0.001365,0.010339,0.011154,0.010858,0.011243,0.00889,0.002497,...,0.004583,0.005636,0.007947,0.013844,0.006955,0.005852,0.005189,0.005335,0.002228,0
2,2,4381,0.001115,0.00037,0.000138,0.0,8e-06,8e-06,0.000418,0.002093,...,0.003074,0.002176,0.002187,0.003226,0.002198,0.003144,0.001724,0.00197,0.001457,1
3,3,4382,0.00187,0.001042,0.001751,0.0,0.000118,0.000118,0.001525,0.011887,...,0.017044,0.013721,0.014986,0.016102,0.012975,0.016939,0.014571,0.012702,0.00993,1
4,4,4383,0.002081,0.001223,0.001724,1.9e-05,0.000104,0.000402,0.001724,0.013061,...,0.017347,0.015103,0.015889,0.017413,0.014364,0.01791,0.015497,0.013267,0.010571,1


In [13]:
# Features: the 24 hourly smsout columns at positions 2..25.
x = data.iloc[:, 2:26]
# Target: the class label at position 26 (the WHF column; original note: h/w).
y = data.iloc[:, 26]

In [14]:
# kNN classifier: 10 neighbours, Euclidean metric.
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=10)

# cross_val_score accuracies for 5 and then 10 folds (integer cv, not `skf`).
scores1, scores2 = (cross_val_score(knn, x, y, cv=n_folds) for n_folds in (5, 10))

# Shuffled, seeded stratified splitter used by the manual fold loop.
n_splits = 10
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [15]:
# Manual stratified k-fold evaluation of `knn` on (x, y) using `skf`.
fold_accuracies = []

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Carve out this fold's training rows and held-out rows.
    X_train, y_train = x.iloc[train_index], y.iloc[train_index]
    X_test, y_test = x.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Score the held-out rows and remember the result.
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # One newline-separated print emits the same four report lines.
    print(
        f"Fold {fold + 1}:",
        f"Train indices: {train_index}, Test indices: {test_index}",
        f"Accuracy: {accuracy:.4f}",
        "-" * 30,
        sep="\n",
    )

# Aggregate over all folds.
print(f"Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")

Fold 1:
Train indices: [  0   1   4   5   6   7   8   9  11  12  14  15  16  17  18  19  20  21
  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56  57  58
  60  61  62  63  64  65  67  68  69  70  71  72  73  74  75  77  78  79
  80  81  82  83  84  85  86  87  88  89  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 109 110 111 112 113 115 116 117 118
 120 121 123 124 125 126 127 128 129 130 131 132 133 134 136 137 138 139
 140 141 142 143 144 145 146 147 148 150 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 178
 179 180 181 183 184 185 187 188 189 192 193 194 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 217 218 219 220
 221 222 223 224], Test indices: [  2   3  10  13  52  59  66  76  90 108 114 119 122 135 149 151 177 182
 186 190 191 195 216]
Accuracy: 0.5217
-----------------------------

In [16]:
# Summarise both cross_val_score runs as mean and standard deviation,
# then list the raw 10-fold accuracies.
five_fold_stats = (scores1.mean(), scores1.std())
ten_fold_stats = (scores2.mean(), scores2.std())

print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % five_fold_stats)
print("10 fold: %f accuracy with a standard deviation of %f" % ten_fold_stats)
print(scores2)

5 fold: 0.48 accuracy with a standard deviation of 0.02
10 fold: 0.494071 accuracy with a standard deviation of 0.068864
[0.39130435 0.39130435 0.47826087 0.52173913 0.52173913 0.45454545
 0.45454545 0.59090909 0.59090909 0.54545455]


In [17]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [18]:
# Load the 15x15-grid Monday (4/11) callin table; this rebinds `data`.
# Swap in the commented-out line to use the 20x20 Friday (8/11) file instead.
data = pd.read_csv("../csv/15x15/classif_15x15_callin.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_callin.csv")

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,cellid,callin0,callin1,callin2,callin3,callin4,callin5,callin6,callin7,...,callin15,callin16,callin17,callin18,callin19,callin20,callin21,callin22,callin23,WHF
0,0,4379,0.000204,0.000716,0.000118,0.0,0.0,0.000324,0.000149,0.002093,...,0.00915,0.013073,0.010728,0.009591,0.008018,0.004955,0.003183,0.001443,0.000482,1
1,1,4380,0.001311,0.000462,0.000277,0.0,0.000101,0.000388,0.000334,0.004638,...,0.014708,0.017623,0.016723,0.013522,0.013662,0.008128,0.004873,0.00261,0.001061,0
2,2,4381,0.000364,0.000249,0.0,0.0,0.0,0.000218,0.000499,0.004747,...,0.010988,0.011554,0.011801,0.008018,0.007783,0.00472,0.002418,0.000904,0.000619,1
3,3,4382,0.001501,0.000915,0.0,0.0,0.0,0.001501,0.002771,0.010943,...,0.031121,0.046033,0.046125,0.04856,0.024682,0.020695,0.009119,0.004223,0.004496,1
4,4,4383,0.001647,0.000863,1.4e-05,0.0,3.5e-05,0.00155,0.002651,0.011641,...,0.033412,0.046762,0.046555,0.051328,0.025946,0.021297,0.010219,0.004508,0.005045,1


In [19]:
# Features: the 24 hourly callin columns at positions 2..25.
x = data.iloc[:, 2:26]
# Target: the class label at position 26 (the WHF column; original note: h/w).
y = data.iloc[:, 26]

In [20]:
# kNN classifier: 10 neighbours, Euclidean metric.
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=10)

# cross_val_score accuracies for 5 and then 10 folds (integer cv, not `skf`).
scores1, scores2 = (cross_val_score(knn, x, y, cv=n_folds) for n_folds in (5, 10))

# Shuffled, seeded stratified splitter used by the manual fold loop.
n_splits = 10
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [21]:
# Manual stratified k-fold evaluation of `knn` on (x, y) using `skf`.
fold_accuracies = []

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Carve out this fold's training rows and held-out rows.
    X_train, y_train = x.iloc[train_index], y.iloc[train_index]
    X_test, y_test = x.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Score the held-out rows and remember the result.
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # One newline-separated print emits the same four report lines.
    print(
        f"Fold {fold + 1}:",
        f"Train indices: {train_index}, Test indices: {test_index}",
        f"Accuracy: {accuracy:.4f}",
        "-" * 30,
        sep="\n",
    )

# Aggregate over all folds.
print(f"Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")

Fold 1:
Train indices: [  0   1   4   5   6   7   8   9  11  12  14  15  16  17  18  19  20  21
  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56  57  58
  60  61  62  63  64  65  67  68  69  70  71  72  73  74  75  77  78  79
  80  81  82  83  84  85  86  87  88  89  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 109 110 111 112 113 115 116 117 118
 120 121 123 124 125 126 127 128 129 130 131 132 133 134 136 137 138 139
 140 141 142 143 144 145 146 147 148 150 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 178
 179 180 181 183 184 185 187 188 189 192 193 194 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 217 218 219 220
 221 222 223 224], Test indices: [  2   3  10  13  52  59  66  76  90 108 114 119 122 135 149 151 177 182
 186 190 191 195 216]
Accuracy: 0.4783
-----------------------------

In [22]:
# Summarise both cross_val_score runs as mean and standard deviation,
# then list the raw 10-fold accuracies.
five_fold_stats = (scores1.mean(), scores1.std())
ten_fold_stats = (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64))

print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % five_fold_stats)
print("10 fold: %f accuracy with a standard deviation of %f" % ten_fold_stats)
print(scores2)

5 fold: 0.46 accuracy with a standard deviation of 0.05
10 fold: 0.449209 accuracy with a standard deviation of 0.084570
[0.47826087 0.52173913 0.43478261 0.43478261 0.30434783 0.59090909
 0.40909091 0.5        0.31818182 0.5       ]


In [23]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [24]:
# Load the 15x15-grid Monday (4/11) callout table into `data2`.
# A separate name is used because `data2.cellid` is read again when the
# `predicted` table is built, after `data` has been rebound to another file.
# The commented-out line is the 20x20 Friday (8/11) alternative; note that it
# assigns to `data`, not `data2`.
data2 = pd.read_csv("../csv/15x15/classif_15x15_callout.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_callout.csv")

# Preview the first rows of the loaded frame.
data2.head()

Unnamed: 0.1,Unnamed: 0,cellid,callout0,callout1,callout2,callout3,callout4,callout5,callout6,callout7,...,callout15,callout16,callout17,callout18,callout19,callout20,callout21,callout22,callout23,WHF
0,0,4379,1e-05,9.7e-05,0.0,0.0,5e-06,0.000152,0.000584,0.002842,...,0.009999,0.009711,0.011318,0.008949,0.005031,0.005754,0.002906,0.001478,0.0006,1
1,1,4380,0.000157,5.7e-05,0.0,9e-06,0.0,0.000237,0.000978,0.006365,...,0.017284,0.016388,0.019477,0.015358,0.011301,0.008685,0.005553,0.002499,0.000675,0
2,2,4381,0.000294,4e-05,2.3e-05,1.7e-05,0.0,0.000282,0.000605,0.006603,...,0.010329,0.010004,0.012635,0.010447,0.005459,0.004089,0.003268,0.001387,0.000201,1
3,3,4382,0.000525,0.000497,0.000245,0.000252,0.0,0.000538,0.001771,0.013803,...,0.034587,0.036961,0.052016,0.038642,0.024501,0.022786,0.009294,0.00922,0.002008,1
4,4,4383,0.000594,0.000537,0.000273,0.000308,0.0,0.000442,0.00183,0.014267,...,0.036211,0.037631,0.053166,0.038921,0.02418,0.022625,0.010503,0.009822,0.0021,1


In [25]:
# Features: the 24 hourly callout columns at positions 2..25.
x = data2.iloc[:, 2:26]
# Target: the class label at position 26 (the WHF column; original note: h/w).
y = data2.iloc[:, 26]

In [26]:
# kNN classifier: 10 neighbours, Euclidean metric.
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=10)

# cross_val_score accuracies for 5 and then 10 folds (integer cv, not `skf`).
scores1, scores2 = (cross_val_score(knn, x, y, cv=n_folds) for n_folds in (5, 10))

# Shuffled, seeded stratified splitter used by the manual fold loop.
n_splits = 10
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [27]:
# Manual stratified k-fold evaluation of `knn` on (x, y) using `skf`.
# Unlike the other fold loops, this one also keeps the out-of-fold prediction
# for every row; `predictions` is read later to build the `predicted` table.
fold_accuracies = []

# Integer buffer for the predicted class labels. -1 means "not predicted yet",
# so an unfilled row cannot be mistaken for a genuine class-0 prediction, which
# a float np.zeros buffer would silently allow.
# Assumes class labels are non-negative integers (0, 1, 2 appear in `y`'s output).
predictions = np.full(len(y), -1, dtype=int)

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Split data into train and test sets for this fold
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the kNN classifier
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the model's performance
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # Store predictions at the positions of the rows they belong to
    predictions[test_index] = y_pred

    print(f"Fold {fold + 1}:")
    print(f"Train indices: {train_index}, Test indices: {test_index}")
    print(f"Accuracy: {accuracy:.4f}")
    print("-" * 30)

# Each row should have landed in exactly one test fold; fail loudly otherwise.
assert (predictions != -1).all(), "some rows never received a prediction"

# Summary of results
print(f"Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")

Fold 1:
Train indices: [  0   1   4   5   6   7   8   9  11  12  14  15  16  17  18  19  20  21
  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56  57  58
  60  61  62  63  64  65  67  68  69  70  71  72  73  74  75  77  78  79
  80  81  82  83  84  85  86  87  88  89  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 109 110 111 112 113 115 116 117 118
 120 121 123 124 125 126 127 128 129 130 131 132 133 134 136 137 138 139
 140 141 142 143 144 145 146 147 148 150 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 178
 179 180 181 183 184 185 187 188 189 192 193 194 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 217 218 219 220
 221 222 223 224], Test indices: [  2   3  10  13  52  59  66  76  90 108 114 119 122 135 149 151 177 182
 186 190 191 195 216]
Accuracy: 0.4348
-----------------------------

In [28]:
# Summarise both cross_val_score runs as mean and standard deviation,
# then list the raw 10-fold accuracies.
five_fold_stats = (scores1.mean(), scores1.std())
ten_fold_stats = (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64))

print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % five_fold_stats)
print("10 fold: %f accuracy with a standard deviation of %f" % ten_fold_stats)
print(scores2)

5 fold: 0.47 accuracy with a standard deviation of 0.07
10 fold: 0.497628 accuracy with a standard deviation of 0.053351
[0.52173913 0.47826087 0.52173913 0.56521739 0.43478261 0.5
 0.59090909 0.5        0.45454545 0.40909091]


In [29]:
# Out-of-fold predictions from cross_val_predict with cv=10 for the current knn, x, y.
# NOTE(review): `y_predt` is not read again in the cells shown here; the exported
# table is built from `predictions` (the manual loop), not from this result.
y_predt = cross_val_predict(knn, x, y, cv=10) #10

In [30]:
# Load the 15x15-grid Monday (4/11) internet table; this rebinds `data`.
# Swap in the commented-out line to use the 20x20 Friday (8/11) file instead.
data = pd.read_csv("../csv/15x15/classif_15x15_internet.csv")
#data = pd.read_csv("../csv/20x20/classif20x20frid_internet.csv")

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,cellid,internet0,internet1,internet2,internet3,internet4,internet5,internet6,internet7,...,internet15,internet16,internet17,internet18,internet19,internet20,internet21,internet22,internet23,WHF
0,0,4379,0.002042,0.001565,0.001452,0.00131,0.001929,0.001698,0.003383,0.005602,...,0.006198,0.006883,0.006717,0.007357,0.005671,0.005637,0.004365,0.003184,0.00243,1
1,1,4380,0.004017,0.003754,0.003254,0.002895,0.002966,0.003225,0.005059,0.009288,...,0.010094,0.011337,0.011814,0.011438,0.011108,0.010726,0.008973,0.007042,0.00587,0
2,2,4381,0.002189,0.001221,0.00098,0.000933,0.000887,0.001303,0.001957,0.003867,...,0.004349,0.005969,0.005831,0.005181,0.004924,0.005221,0.00485,0.002976,0.00242,1
3,3,4382,0.017824,0.010351,0.008671,0.009422,0.009445,0.010731,0.014947,0.02175,...,0.0224,0.023691,0.026365,0.027358,0.024565,0.02504,0.025541,0.019089,0.015443,1
4,4,4383,0.019145,0.010713,0.009025,0.009709,0.009752,0.010889,0.01516,0.023102,...,0.024713,0.025571,0.028881,0.029283,0.026267,0.02687,0.027343,0.020482,0.016373,1


In [31]:
# Features: the 24 hourly internet columns at positions 2..25.
x = data.iloc[:, 2:26]
# Target: the class label at position 26 (the WHF column; original note: h/w).
y = data.iloc[:, 26]

In [32]:
# kNN classifier: 10 neighbours, Euclidean metric.
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=10)

# cross_val_score accuracies for 5 and then 10 folds (integer cv, not `skf`).
scores1, scores2 = (cross_val_score(knn, x, y, cv=n_folds) for n_folds in (5, 10))

# Shuffled, seeded stratified splitter used by the manual fold loop.
n_splits = 10
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_splits)

In [33]:
# Manual stratified k-fold evaluation of `knn` on (x, y) using `skf`.
fold_accuracies = []

for fold, (train_index, test_index) in enumerate(skf.split(x, y)):
    # Carve out this fold's training rows and held-out rows.
    X_train, y_train = x.iloc[train_index], y.iloc[train_index]
    X_test, y_test = x.iloc[test_index], y.iloc[test_index]

    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    # Score the held-out rows and remember the result.
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # One newline-separated print emits the same four report lines.
    print(
        f"Fold {fold + 1}:",
        f"Train indices: {train_index}, Test indices: {test_index}",
        f"Accuracy: {accuracy:.4f}",
        "-" * 30,
        sep="\n",
    )

# Aggregate over all folds.
print(f"Mean accuracy + std over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")

Fold 1:
Train indices: [  0   1   4   5   6   7   8   9  11  12  14  15  16  17  18  19  20  21
  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56  57  58
  60  61  62  63  64  65  67  68  69  70  71  72  73  74  75  77  78  79
  80  81  82  83  84  85  86  87  88  89  91  92  93  94  95  96  97  98
  99 100 101 102 103 104 105 106 107 109 110 111 112 113 115 116 117 118
 120 121 123 124 125 126 127 128 129 130 131 132 133 134 136 137 138 139
 140 141 142 143 144 145 146 147 148 150 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 178
 179 180 181 183 184 185 187 188 189 192 193 194 196 197 198 199 200 201
 202 203 204 205 206 207 208 209 210 211 212 213 214 215 217 218 219 220
 221 222 223 224], Test indices: [  2   3  10  13  52  59  66  76  90 108 114 119 122 135 149 151 177 182
 186 190 191 195 216]
Accuracy: 0.6522
-----------------------------

In [34]:
# Summarise both cross_val_score runs as mean and standard deviation,
# then list the raw 10-fold accuracies.
five_fold_stats = (scores1.mean(), scores1.std())
ten_fold_stats = (scores2.mean(dtype=np.float64), scores2.std(dtype=np.float64))

print("5 fold: %0.2f accuracy with a standard deviation of %0.2f" % five_fold_stats)
print("10 fold: %f accuracy with a standard deviation of %f" % ten_fold_stats)
print(scores2)

5 fold: 0.46 accuracy with a standard deviation of 0.07
10 fold: 0.476877 accuracy with a standard deviation of 0.117940
[0.47826087 0.52173913 0.26086957 0.43478261 0.39130435 0.5
 0.68181818 0.36363636 0.5        0.63636364]


In [35]:
#y_predt = cross_val_predict(knn, x, y, cv=5) #10

In [36]:
# Build the per-cell prediction table from the callout fold loop's `predictions`
# (a 1-D array, so the transpose the original applied was a no-op).
predicted_labels = np.asarray(predictions).astype(int)
predicted = pd.DataFrame({'predicted': predicted_labels})

# Attach the cell identifiers from the callout frame (`data2`), matched on row index.
predicted['cellid'] = data2['cellid']

predicted.head()

Unnamed: 0,predicted,cellid
0,1,4379
1,2,4380
2,1,4381
3,1,4382
4,1,4383


In [37]:
#predicted.to_csv('../csv/predicted_costrat_15x15.csv')