# 1. Make classifier for MNIST with 97% accuracy

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
mnist = fetch_openml('mnist_784', version=1)

In [16]:
# split into training and test sets

X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [4]:
# try KNeighborsClassifier w/ grid search on weights and n_neighbors

In [5]:
# THIS TAKES A LONG TIME TO RUN!
# uncomment if you have time
'''
pipeline_kn = Pipeline(steps = [("kn_class", KNeighborsClassifier())])

param_grid = [
    {'kn_class__n_neighbors': [1,2,3], 'kn_class__weights': ["uniform", "distance"]}
]

grid_search = GridSearchCV(pipeline_kn, param_grid, cv=5,
                        scoring="accuracy",
                        return_train_score=True)

grid_search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)
'''

'\npipeline_kn = Pipeline(steps = [("kn_class", KNeighborsClassifier())])\n\nparam_grid = [\n    {\'kn_class__n_neighbors\': [1,2,3], \'kn_class__weights\': ["uniform", "distance"]}\n]\n\ngrid_search = GridSearchCV(pipeline_kn, param_grid, cv=5,\n                        scoring="accuracy",\n                        return_train_score=True)\n\ngrid_search.fit(X_train, y_train)\n\nprint("Best parameter (CV score=%0.3f):" % grid_search.best_score_)\nprint(grid_search.best_params_)\n'

In [6]:
# Solution to the above was 
'''
Best parameter (CV score=0.971):
{'kn_class__n_neighbors': 3, 'kn_class__weights': 'distance'}
'''

"\nBest parameter (CV score=0.971):\n{'kn_class__n_neighbors': 3, 'kn_class__weights': 'distance'}\n"

# 2. Test augmented data by shifting images in different directions

In [2]:
from scipy.ndimage.interpolation import shift
import matplotlib.pyplot as plt
import numpy as np

In [27]:
all_digits_images = X.iloc[:].values.reshape(70000, 28, 28)
X_train_digits_images = X_train.iloc[:].values.reshape(60000, 28, 28)

In [29]:
X_train_digits_labels = np.concatenate((y_train,y_train,y_train,y_train,y_train)) # the labels should still be the same

In [32]:
# make new datasets with shifts [0,del_y,del_x]

X_train_images_no_shift = shift(X_train_digits_images, [0,0,0], cval=0)
X_train_images_shifted_xp1 = shift(X_train_digits_images, [0,0,1], cval=0)
X_train_images_shifted_xm1 = shift(X_train_digits_images, [0,0,-1], cval=0)
X_train_images_shifted_yp1 = shift(X_train_digits_images, [0,1,0], cval=0)
X_train_images_shifted_ym1 = shift(X_train_digits_images, [0,-1,0], cval=0)

In [33]:
# concatenate everything

X_train_augmented = np.concatenate((X_train_images_no_shift,
                                    X_train_images_shifted_xp1,
                                    X_train_images_shifted_xm1,
                                    X_train_images_shifted_yp1,
                                    X_train_images_shifted_ym1), axis=0)

In [35]:
# flatten again

X_train_augmented = X_train_augmented.reshape(300000,784)

In [37]:
# redo grid search as before (but with smaller grid to save time)

# THIS TAKES A LONG TIME TO RUN! I GAVE UP ON THIS AFTER 1/2 HOUR OR SO
# uncomment if you have time

pipeline_kn_augmented = Pipeline(steps = [("kn_class", KNeighborsClassifier())])

param_grid = [
    {'kn_class__n_neighbors': [3], 'kn_class__weights': ["distance"]}
]

grid_search = GridSearchCV(pipeline_kn_augmented, param_grid, cv=5,
                        scoring="accuracy",
                        return_train_score=True)

grid_search.fit(X_train_augmented, X_train_digits_labels)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)


KeyboardInterrupt: 

# 3. Tackle the Titanic dataset

In [3]:
from sklearn.preprocessing import OrdinalEncoder

In [4]:
df_titanic_train = pd.read_csv("./data/titanic_train.csv") # length 891
df_titanic_test = pd.read_csv("./data/titanic_test.csv") # length 418

In [5]:
# concatenate first, so that encoding and transformations are the same

#df_titanic_all = pd.concat((df_titanic_test,df_titanic_train))

In [6]:
# drop rows with nans in important columns

df_titanic_train_dropped = df_titanic_train.dropna(axis=0, how="any", subset=["Survived",
                                                                      "Sex","Age","SibSp",
                                                                      "Parch","Ticket","Fare",
                                                                      "Cabin","Embarked"])

In [11]:
 ordinal_encoder.fit_transform(df_titanic_train_dropped[["Sex"]])

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],

In [12]:
# encode strings, classes, etc.

# handle_unknown enables replacement of NaNs with a number
df_titanic_train_dropped.reset_index(drop=True, inplace=True)
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
df_titanic_train_dropped.loc[:,"Sex_encoded"] = ordinal_encoder.fit_transform(df_titanic_train_dropped[["Sex"]])
print(ordinal_encoder.categories_)
print("----")
df_titanic_train_dropped.loc[:,"Embarked_encoded"] = ordinal_encoder.fit_transform(df_titanic_train_dropped[["Embarked"]])
print(ordinal_encoder.categories_)

[array(['female', 'male'], dtype=object)]
----
[array(['C', 'Q', 'S'], dtype=object)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [40]:
df_titanic_train["Embarked"].iloc["Embarked" != np.str]

TypeError: Cannot index by location index with a non-integer key

In [26]:
df_titanic_train["Sex_encoded"]

0      1.0
1      0.0
2      0.0
3      0.0
4      1.0
      ... 
886    1.0
887    0.0
888    0.0
889    1.0
890    1.0
Name: Sex_encoded, Length: 891, dtype: float64

In [27]:
df_titanic_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1.0


In [None]:
# define targets

X, y = df_titanic_train["data"], df_titanic_train["Survived"]
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [15]:
# try a grid search like further above

'''
pipeline_kn = Pipeline(steps = [("kn_class", KNeighborsClassifier())])

param_grid = [
    {'kn_class__n_neighbors': [1,2,3], 'kn_class__weights': ["uniform", "distance"]}
]

grid_search = GridSearchCV(pipeline_kn, param_grid, cv=5,
                        scoring="accuracy",
                        return_train_score=True)

grid_search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
print(grid_search.best_params_)
'''

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Build a spam classifier

In [4]:
import glob

In [23]:
# make lists of spam and ham files

spam_stem = "/Users/bandari/Documents/git.repos/ml/datasets/"
list_spam_all = glob.glob(spam_stem + "spam/" + "0*.*")
list_ham_all = glob.glob(spam_stem + "easy_ham/" + "*")

In [25]:
list_ham_all

['/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/2170.78c282a5e417d6d231dc75aa8588ebb7',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/0461.7609029b07acf65f40bd93882aa35887',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/1077.5967a2892063a3470f32596403b095aa',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/1000.348e8a82897b05d584de019f4dd5c7e2',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/2125.ee1e93366c2199584cfbd4cbb353d918',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/0401.9ab0bcc81e70d6930acfefc2854a0172',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/0502.9e78d7d003ceeef52c70ac7e7b786ef9',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/1561.b968a0929d29009dbb76603323c1862f',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/0634.2da7e83d50baee90c3c9defd2aa30b72',
 '/Users/bandari/Documents/git.repos/ml/datasets/easy_ham/1005.aaca009bb6cf75da8456c570bad6ad08',
 '/Users/bandari/Doc

In [None]:
# make a feature vector 

'''
HAM/SPAM:
Content-Type: text/plain OR text/html         [0/1]
From: if name in quotes / if plain email only [0/1]
number of occurrences of word "Herbal"        [N]
number of occurrences of word "buy"           [N]
'''

In [32]:
# TEST: detect a string in all this

with open(list_spam_all[0], 'rb') as f:
    contents = f.read()
    print("102192086381143" in str(contents))

True
