In [1]:
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score

In [2]:
os.listdir()

['.ipynb_checkpoints',
 'Just the Basics - Strata 2013 After-party.ipynb',
 'just-the-basics-the-after-party.zip',
 'test.csv',
 'train.csv',
 'train_labels.csv']

In [3]:
with ZipFile('just-the-basics-the-after-party.zip', 'r') as file:
    file.extractall()
    
os.listdir()

['.ipynb_checkpoints',
 'Just the Basics - Strata 2013 After-party.ipynb',
 'just-the-basics-the-after-party.zip',
 'test.csv',
 'train.csv',
 'train_labels.csv']

In [4]:
train = pd.read_csv("train.csv", header=None)
test = pd.read_csv("test.csv", header=None)
labels = pd.read_csv("train_labels.csv", header=None)

train.shape, test.shape, labels.shape

((600, 100), (4000, 100), (600, 1))

In [5]:
split_size = int(len(train)*0.9)

train_features = train[:split_size]
train_labels = labels[:split_size]

validation_features = train[split_size:]
validation_labels = labels[split_size:]

train_features.shape, validation_features.shape, train_labels.shape, validation_labels.shape

((540, 100), (60, 100), (540, 1), (60, 1))

In [6]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

train_features_cleaned = imputer.fit_transform(train_features)
validation_features_cleaned = imputer.fit_transform(validation_features)

In [7]:
pd.DataFrame([[train_features_cleaned, validation_features_cleaned]]).isnull().sum()

0    0
1    0
dtype: int64

In [8]:
pd.DataFrame(validation_features_cleaned).isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_features_scaled = scaler.fit_transform(train_features_cleaned)
validation_features_scaled = scaler.fit_transform(validation_features_cleaned)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

predictions_dict = dict()
scores_dict = dict()

class_models = [LogisticRegression(random_state=3), 
                DecisionTreeClassifier(random_state=3), 
                RandomForestClassifier(random_state=3)]


for class_model in class_models:
    class_model.fit(train_features_scaled, train_labels.values)
    class_model_name = f"{str(class_model).replace(r'(random_state=3)', '')}"
    predictions_dict[class_model_name] = list(class_model.predict(validation_features_scaled))
    scores_dict[class_model_name] = accuracy_score(validation_labels, predictions_dict[class_model_name])
    print(f"{class_model_name}: {round(scores_dict[class_model_name] * 100, 2)}%")

LogisticRegression: 81.67%
DecisionTreeClassifier: 83.33%
RandomForestClassifier: 81.67%


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.nn import relu, sigmoid
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

model = Sequential([
    Flatten(),
    Dense(32, activation=relu),
    Dropout(0.2),
    Dense(64, activation=relu),
    Dense(1, activation=sigmoid)
])

model.compile(loss='binary_crossentropy',
             optimizer=Adam(),
             metrics=['accuracy'])

model.fit(train_features_scaled, 
          train_labels.values,
          epochs=100,
          validation_data=(validation_features_scaled, validation_labels), 
          verbose=0)

nn_predictions_proba = model.predict(validation_features_scaled)
nn_predictions = list(map(lambda proba: 1 if proba >= 0.5 else 0, nn_predictions_proba))
nn_score = accuracy_score(validation_labels, nn_predictions)
predictions_dict["Neural Network"] = nn_predictions
scores_dict["Neural Network"] = nn_score
print(f"Accuracy score: {round(nn_score * 100, 2)}%")

Accuracy score: 83.33%


In [13]:
pd.DataFrame(predictions_dict)

Unnamed: 0,LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,Neural Network
0,1,1,1,1
1,0,0,0,0
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,0,1,1
6,1,1,1,1
7,0,0,0,0
8,0,0,0,0
9,0,0,1,1


In [14]:
pd.DataFrame({'% Accuracy':[round(score *100, 2) for score in scores_dict.values()]}, index=scores_dict.keys())

Unnamed: 0,% Accuracy
LogisticRegression,81.67
DecisionTreeClassifier,83.33
RandomForestClassifier,81.67
Neural Network,83.33
