In [None]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import warnings
import io

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read both CSV files, using the first column of each as the row index.
# Rows of the training set that contain any missing value are discarded
# immediately; the evaluation set is kept as read.
train_data = pd.read_csv('TrainONMe-4.csv', index_col=0).dropna()
eval_data = pd.read_csv('EvaluateOnMe-4.csv', index_col=0)

In [None]:
# Drop, by index label, every row whose x7 value is 'chottis' or 'olka'.
x7_values_to_drop = ['chottis', 'olka']
labels_to_drop = train_data.index[train_data.x7.isin(x7_values_to_drop)]
train_data = train_data.drop(index=labels_to_drop)

In [None]:
# Remove the rows whose x12 value is "YEP True" or "Nope False".
# A boolean mask removes every occurrence and is a no-op when none are left,
# so this cell can be re-run safely. Looking up `.index[0]` raised IndexError
# on a second run and only ever removed the first matching row.
x12_values_to_drop = ["YEP True", "Nope False"]
train_data = train_data[~train_data.x12.isin(x12_values_to_drop)].copy()

In [None]:
# Turn "?" placeholders into NaN and discard the rows that contain them,
# then store x4 as a float column.
train_data = train_data.replace(["?"], np.nan).dropna()
train_data["x4"] = train_data["x4"].astype(float)

In [None]:
def _is_int_like(label):
    """Return True when ``int(label)`` succeeds, False otherwise.

    Only the exceptions ``int()`` raises for unparsable input are caught, so
    unrelated errors (for example KeyboardInterrupt) are no longer swallowed.
    """
    try:
        int(label)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Keep only the rows whose index label can be converted to an integer.
# One boolean mask replaces the per-label in-place drops: it rebuilds the
# frame once, and it does not fail when an invalid label occurs more than once
# (the second `drop` of an already removed label raised KeyError).
has_int_label = np.fromiter(
    (_is_int_like(label) for label in train_data.index),
    dtype=bool,
    count=len(train_data.index),
)
train_data = train_data.loc[has_int_label].copy()
train_data.index = train_data.index.astype(int)


In [None]:
ZSCORE_THREASHOLD = 4

# Absolute z-score of each value in the float columns.
float_columns = train_data.select_dtypes(include=["float"])
zscore = np.abs(stats.zscore(float_columns))

# A row is kept only if none of its float values exceeds the threshold.
has_outlier = (zscore > ZSCORE_THREASHOLD).any(axis=1)
is_inlier = ~has_outlier
train_data = train_data[is_inlier]

In [None]:
# Pairwise correlations of the numeric columns.
# The frame still contains string columns (x7 and x12 are compared with
# strings above); pandas >= 2.0 raises on those because `numeric_only`
# defaults to False, so the numeric/bool columns are selected explicitly.
# Selecting by dtype also works on older pandas releases.
corrs = train_data.select_dtypes(include=["number", "bool"]).corr()

# Working copy without x1, x2, x6 and x13.
# NOTE(review): presumably these columns were chosen after inspecting `corrs` - confirm.
test_train_data = train_data.drop(columns=["x1", "x2", "x6", "x13"])

In [None]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

# Features are all remaining columns except the target column `y`.
X = test_train_data.drop(columns='y')
y = train_data['y']

# Hold out 20% of the rows; the split is seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)


In [None]:
# Column names grouped by dtype: float64 columns feed the numerical branch,
# object columns feed the categorical branch of the preprocessor.
float64_columns = X.select_dtypes(include='float64')
object_columns = X.select_dtypes(include='object')
numerical_features = float64_columns.columns
categorical_features = object_columns.columns

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

# Categorical branch: fill missing values with the literal "missing", then
# map each category to an integer code.
# Categories not seen during fit are encoded as -1 instead of raising, which
# matches the encoder settings of the final pipeline defined later in this
# notebook and keeps cross-validation folds from failing on unseen values.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA

# Numerical branch: median imputation followed by a 7-component PCA.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=7)),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its own transformer:
# (branch name, transformer, columns it applies to).
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# Randomized hyper-parameter search for a random forest placed behind the
# shared preprocessor. Parameter names are prefixed with the pipeline step
# name ("forest__").
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('forest', RandomForestClassifier(random_state=RANDOM_STATE)),
])
params = {
    'forest__bootstrap': [True, False],
    'forest__max_depth': list(range(1, 30)) + [None],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', and it
    # was removed in scikit-learn 1.3.
    'forest__max_features': ['sqrt', 'log2'],
    'forest__min_samples_leaf': list(range(1, 11)),
    # Starts at 2: an integer min_samples_split of 1 is rejected by
    # scikit-learn, so those candidates only produced failed fits.
    'forest__min_samples_split': list(range(2, 11)),
    'forest__n_estimators': [200, 300, 400, 500, 600],
}

# Seeded, shuffled 10-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
rfr_random = RandomizedSearchCV(
    pipeline,
    param_distributions=params,
    n_iter=1500,
    cv=cv,
    verbose=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    return_train_score=True,
)
print("Fitting now")
# Tune on the training split only, so the hold-out rows in X_test stay unseen
# until the final evaluation.
rfr_random.fit(X_train, y_train)
rfr_random.best_score_

In [None]:
# from sklearn.model_selection import StratifiedKFold

# current_best_score = {}

# #from tqdm.notebook import tqdm, trange

# for depth in range(1, 30):
#     print(depth)
#     for i in range(1, 9):
#         from sklearn.feature_selection import SelectKBest, f_classif
#         from  sklearn.ensemble import RandomForestClassifier

#         pipeline = Pipeline(steps=[('preprocessor',
#                  ColumnTransformer(transformers=[('num',
#                                                   Pipeline(steps=[('imputer',
#                                                                    SimpleImputer(strategy='median')),
#                                                                   ('pca',
#                                                                    PCA(n_components=7))]),
#                                                   pd.Index(['x3', 'x4', 'x5', 'x8', 'x9', 'x10', 'x11'], dtype='object')),
#                                                  ('cat',
#                                                   Pipeline(steps=[('imputer',
#                                                                    SimpleImputer(fill_value='missing',
#                                                                                  strategy='constant')),
#                                                                   ('encoder',
#                                                                    OrdinalEncoder())]),
#                                                   pd.Index(['x7', 'x12'], dtype='object'))])),
#                 ('select', SelectKBest(f_classif, k=i)),
#                 ('forest',
#                  RandomForestClassifier(max_depth=depth, max_features='sqrt',
#                                         min_samples_split=3,
#                                         n_estimators=300))])

#         # depth = 12 is the RandomizedSearchCV result.

#         # Depth 14 with n = 6 (SelectKBest k) gives the best accuracy and CV score.

#         # Test train split
#         from sklearn.model_selection import cross_val_score
#         model = pipeline.fit(X_train, y_train)
#         #model_score = model.score(X_test, y_test)
#         cv = StratifiedKFold(shuffle=True, n_splits=10)
#         cv_score = np.average(cross_val_score(pipeline, X_train, y_train, cv=cv, n_jobs=-1))
#         if len(current_best_score) == 0 or cv_score > current_best_score["CV Score"]:
#             current_best_score = {
#             "depth": depth,
#             "n": i,
#             "Accuracy Score": model.score(X_test, y_test),
#             "CV Score": cv_score
#             }

In [None]:
#current_best_score

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from  sklearn.ensemble import RandomForestClassifier

# Final model: preprocessing -> univariate feature selection -> random forest.
# The column lists are spelled out explicitly for this pipeline.
final_numeric_columns = pd.Index(['x3', 'x4', 'x5', 'x8', 'x9', 'x10', 'x11'], dtype='object')
final_categorical_columns = pd.Index(['x7', 'x12'], dtype='object')

# Numerical branch: median imputation followed by a 7-component PCA.
final_numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=7)),
])

# Categorical branch: constant imputation, then integer codes; categories not
# seen during fit are encoded as -1 instead of raising.
final_categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', final_numeric_branch, final_numeric_columns),
        ('cat', final_categorical_branch, final_categorical_columns),
    ])),
    # Keep the 6 transformed features with the highest ANOVA F-score.
    ('select', SelectKBest(f_classif, k=6)),
    # Seeded like every other stochastic step in this notebook, so the fitted
    # model, its scores and the exported predictions are reproducible.
    ('forest', RandomForestClassifier(max_depth=13, max_features='sqrt',
                                      min_samples_split=3,
                                      n_estimators=200,
                                      random_state=RANDOM_STATE)),
])

In [None]:
# Re-read the evaluation set (first column as index) and store x12 as strings.
# NOTE(review): presumably the cast makes x12 match the string-valued x12 of
# the training data - confirm.
eval_data = pd.read_csv('EvaluateOnMe-4.csv', index_col=0).astype({'x12': str})

In [None]:
# Fit the final pipeline on the training split. scikit-learn's `fit` returns
# the estimator itself, so `model` and `pipeline` name the same fitted object.
model = pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict the hold-out split once and reuse the result for both reports.
test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))

# Take the row/column labels from the fitted model and pass the same order to
# `confusion_matrix`, so the table headers cannot drift from the order of the
# matrix (hard-coded names silently mislabel it if the classes change).
# Rows are true classes, columns are predicted classes.
class_labels = model.classes_
pd.DataFrame(
    confusion_matrix(y_test, test_predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

In [None]:
from sklearn.model_selection import cross_val_score

# Run cross-validation (default `cv` setting) 100 times on the training split
# and collect the mean fold score of each run.
cv_list = [
    cross_val_score(pipeline, X_train, y_train, n_jobs=-1).mean()
    for _ in range(100)
]

# Average of the 100 per-run means.
print(np.average(cv_list))

In [None]:
# Predict a label for every row of the evaluation set with the fitted pipeline.
predictions = model.predict(eval_data)

In [None]:
# Write the predictions to predictions.txt, one label per line, in the order
# they were produced.
with open("predictions.txt", "w") as out_file:
    out_file.writelines(str(label) + "\n" for label in predictions)