In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Forward-slash relative paths resolve on Windows, macOS and Linux alike;
# the original backslash form only resolves on Windows.
test_data = pd.read_csv('Data/testing.csv')
train_data = pd.read_csv('Data/training.csv')

## Exploratory Data Analysis (EDA)

In [None]:
train_data.info()

In [None]:
# Columns with NA's: count the missing values once, then keep only the
# columns whose count is non-zero.
train_data.isna().sum().loc[lambda na_counts: na_counts > 0]

In [None]:
# `Series.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
# `is_monotonic_increasing` is the same check and exists in every version.
print(f"Id is Monotonic -> {train_data['id'].is_monotonic_increasing}")
print(f"Id is Unique -> {train_data['id'].is_unique}")

In [None]:
train_data['Kingdom'].value_counts()

In [None]:
# Number of training rows per Kingdom.
sns.histplot(train_data, x='Kingdom', binwidth=1)

In [None]:
train_data['DNAtype'].value_counts()

In [None]:
# Number of training rows per DNAtype.
sns.histplot(train_data, x='DNAtype', binwidth=1)

In [None]:
train_data.groupby(by=['Kingdom', 'DNAtype'])['id'].count()

In [None]:
# Rows per Kingdom, coloured by DNAtype.
sns.histplot(train_data, x='Kingdom', hue='DNAtype', binwidth=1)

In [None]:
train_data[train_data['DNAtype'].isna()]

In [None]:
train_data['SpeciesID'].value_counts()

In [None]:
train_data['SpeciesID'].value_counts().value_counts()

In [None]:
train_data[train_data['SpeciesID'] == 3702]

In [None]:
# Does every SpeciesID map to exactly one Kingdom?
# Count the distinct kingdoms per SpeciesID, then tabulate those counts.
train_data.groupby('SpeciesID')[['Kingdom']].nunique().value_counts()
# Species with the same speciesID do indeed have the same kingdom

In [None]:
train_data['Ncodons'].value_counts()

In [None]:
train_data['Ncodons'].value_counts().value_counts().plot(kind='bar')

In [None]:
train_data[train_data['Ncodons'] == 1140].head()

In [None]:
# Count the distinct kingdoms per Ncodons value, then tabulate those counts.
train_data.groupby('Ncodons')[['Kingdom']].nunique().value_counts()
# Species with the same Ncodons do not have the same kingdom

In [None]:
def assert_float(x):
    """Return True if `x` can be converted with float(), otherwise False.

    Only the failures float() raises for bad input count as "not a float":
    ValueError (unparseable string), TypeError (unsupported type such as
    None) and OverflowError (integer too large for a float). Anything else,
    e.g. KeyboardInterrupt, propagates instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Rows whose UUC entry cannot be parsed as a float.
train_data.loc[~train_data['UUC'].map(assert_float)]

In [None]:
# Rows whose UUU entry cannot be parsed as a float.
train_data.loc[~train_data['UUU'].map(assert_float)]

In [None]:
# Frequency of the first word of each species name.
# `.str[0]` propagates a missing name as NaN, whereas a Python-level `x[0]`
# inside `apply` raises TypeError on a float NaN.
train_data['SpeciesName'].str.split(' ').str[0].value_counts()

## Drop NAs and non-float rows before computing correlations

In [None]:
# Keep only rows whose UUU *and* UUC entries parse as floats (both columns are
# checked for non-float entries in the cells above), drop rows with missing
# values, and cast the two columns to float so they count as numeric downstream
# (correlation, column sums, model features).
codon_is_numeric = train_data['UUU'].apply(assert_float) & train_data['UUC'].apply(assert_float)
train_corr = (
    train_data[codon_is_numeric]
    .dropna()
    .astype({'UUU': float, 'UUC': float})
    .reset_index(drop=True)
)

In [None]:
sns.set(rc={'figure.figsize':(15,15)})
# Select the numeric columns explicitly: from pandas 2.0 on, DataFrame.corr()
# raises on string columns instead of silently skipping them.
sns.heatmap(train_corr.select_dtypes(include='number').corr(),
            cmap=sns.diverging_palette(0, 255, sep=8, n=256))

In [None]:
# Sum up all the normalized codons
# NOTE(review): axis=0 totals each column over all rows. If the intent is to
# check that each row's codon frequencies add up to 1, axis=1 would be needed
# - confirm. The positional slice 7: also assumes a fixed column order - confirm.
train_corr.iloc[:,7:].sum(axis=0)

## Build Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.inspection import permutation_importance

In [None]:
def get_text_words(dataframe, keywords):
    """Return a copy of `dataframe` with one 0/1 indicator column per keyword.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'SpeciesName' column of strings. It is not modified.
    keywords : iterable of str
        Patterns searched for, case-insensitively, in 'SpeciesName'. Each one
        becomes a new column named after the keyword.

    Returns
    -------
    pandas.DataFrame
        The copy, with an integer column per keyword: 1 where the name
        contains it, 0 otherwise. Missing names count as 0.
    """
    df = dataframe.copy()
    for word in keywords:
        # na=False maps a missing name to "no match" so the indicator is always
        # a clean 0/1 integer rather than NaN; astype(int) replaces the
        # bool -> int `replace` mapping, whose downcasting is deprecated.
        df[word] = df['SpeciesName'].str.contains(word, case=False, na=False).astype(int)

    return df

In [None]:
training_data = get_text_words(train_corr, ['chloroplast', 'virus', 'human', 'bacteria'])

In [None]:
# Identifiers, the free-text name and the target itself are not features.
non_feature_columns = ['id', 'SpeciesID', 'SpeciesName', 'Kingdom']
train_features = training_data.drop(columns=non_feature_columns).copy()
train_labels = training_data['Kingdom'].copy()

In [None]:
print(train_features.shape, train_labels.shape)

In [None]:
# Hold-out split, stratified on the Kingdom label and seeded for repeatability.
split_parts = train_test_split(
    train_features,
    train_labels,
    stratify=train_labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
rnd_clf = RandomForestClassifier(random_state=42)

In [None]:
rnd_clf.fit(X_train, y_train)

In [None]:
# 5-fold cross-validated accuracy of the baseline random forest on the training split.
scores = cross_val_score(rnd_clf, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1)

print(scores, f'Mean: {scores.mean()}', f'Standard Deviation: {scores.std()}')

In [None]:
# scikit-learn metrics take their arguments as (y_true, y_pred).
accuracy_score(y_test, rnd_clf.predict(X_test))

In [None]:
# Predict once and reuse the result; metrics take (y_true, y_pred).
y_test_pred = rnd_clf.predict(X_test)
f1_score(y_test, y_test_pred, average='micro'), f1_score(y_test, y_test_pred, average='macro')

## Feature Importance

In [None]:
# Taking a look at the most important features
std = np.std([tree.feature_importances_ for tree in rnd_clf.estimators_], axis=0)
pd.Series(rnd_clf.feature_importances_, index=X_train.columns).plot.bar(yerr=std)

In [None]:
# Permutation importance of the fitted forest: each feature is shuffled
# n_repeats times and the resulting change in score is recorded.
# NOTE(review): this is computed on the training split; evaluating on the
# held-out X_test / y_test would show which features matter for
# generalization - confirm which one is intended.
result = permutation_importance(
            rnd_clf, 
            X_train,
            y_train,
            n_repeats=10, 
            random_state=42, 
            n_jobs=-1
)

In [None]:
pd.Series(result.importances_mean, index=X_train.columns).plot.bar(yerr=result.importances_std)

## Tune Model

In [None]:
rnd_clf.get_params()

In [None]:
# Coarse randomized search over the random-forest hyper-parameters.
param_grid = [{'n_estimators': np.linspace(100, 1000, 5, dtype=int),
               'max_depth': np.linspace(100, 1000, 5, dtype=int),
               'max_leaf_nodes': np.linspace(20, 50, 5, dtype=int),
               'min_samples_leaf': np.linspace(2, 5, 3, dtype=int),
               'min_samples_split': np.linspace(4, 12, 3, dtype=int),
            }]

rnd_clf = RandomForestClassifier(random_state=42)

# random_state fixes which parameter combinations are sampled, so the search
# (and therefore best_params_) is the same on every Restart & Run All.
random_search = RandomizedSearchCV(rnd_clf,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   return_train_score=True,
                                   random_state=42)

random_search.fit(X_train, y_train)
random_search.best_params_

In [None]:
# Mean cross-validated accuracy for every sampled parameter combination.
cvres = random_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_score is already a scalar, so no reduction is needed before printing.
    print(mean_score, params)

In [None]:
# Exhaustive search over a small random-forest grid.
param_grid = [{
    'n_estimators': np.linspace(200, 500, 3, dtype=int),
    'min_samples_leaf': np.linspace(1, 3, 2, dtype=int),
    'min_samples_split': np.linspace(2, 6, 2, dtype=int),
}]

rnd_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rnd_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)

grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [None]:
# Mean cross-validated accuracy for every grid point.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_score is already a scalar, so no reduction is needed before printing.
    print(mean_score, params)

In [None]:
# With refit=True (the GridSearchCV default) best_estimator_ has already been
# refit on all of (X_train, y_train); fitting it again would only repeat that work.
model_of_choice = grid_search.best_estimator_
model_of_choice

In [None]:
# 5-fold cross-validated accuracy of the selected model on the training split.
scores = cross_val_score(model_of_choice, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1)

print(scores, f'Mean: {scores.mean()}', f'Standard Deviation: {scores.std()}')

## Gradient Boost Attempt

In [None]:
gbt_clf = GradientBoostingClassifier(random_state=42)

In [None]:
gbt_clf.fit(X_train, y_train)

In [None]:
# 3-fold cross-validated accuracy of the default gradient-boosting classifier.
scores = cross_val_score(gbt_clf, X_train, y_train, scoring="accuracy", cv=3, n_jobs=-1)

print(scores, f'Mean: {scores.mean()}', f'Standard Deviation: {scores.std()}')

In [None]:
# Grid search over the gradient-boosting learning rate and number of trees.
# np.linspace(0.01, 1, 2) yields only the two endpoints [0.01, 1.0], and
# np.linspace(100, 1000, 3, dtype=int) yields [100, 550, 1000].
# NOTE(review): intermediate learning rates are never tried - confirm this
# coarse grid is intended.
param_grid = [{'learning_rate': np.linspace(0.01, 1, 2),
                'n_estimators': np.linspace(100, 1000, 3, dtype=int),
            }]

gbt_clf = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(gbt_clf,
                           param_grid, 
                           cv=3,
                           scoring='accuracy',
                           n_jobs = -1,
                           return_train_score=True)

# Rebinds grid_search, replacing the random-forest search object from earlier.
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [None]:
# Mean cross-validated accuracy for every grid point.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # mean_score is already a scalar, so no reduction is needed before printing.
    print(mean_score, params)

In [None]:
# With refit=True (the GridSearchCV default) best_estimator_ has already been
# refit on all of (X_train, y_train); fitting it again would only repeat that work.
model_of_choice = grid_search.best_estimator_
model_of_choice

In [None]:
# 5-fold cross-validated accuracy of the selected model on the training split.
scores = cross_val_score(model_of_choice, X_train, y_train, scoring="accuracy", cv=5, n_jobs=-1)

print(scores, f'Mean: {scores.mean()}', f'Standard Deviation: {scores.std()}')

## Submission

In [None]:
# Build the features from the raw test columns only. A later cell writes a
# 'Kingdom' prediction column into test_data, so re-running this cell after
# that one would otherwise leak the column into the feature matrix.
testing_data = get_text_words(test_data.drop(columns=['Kingdom'], errors='ignore'),
                              ['chloroplast', 'virus', 'human', 'bacteria'])

In [None]:
# Predict a Kingdom for every test row from the engineered feature columns.
test_features = testing_data.drop(columns=['id', 'SpeciesID', 'SpeciesName'])
test_data['Kingdom'] = model_of_choice.predict(test_features)

In [None]:
# SpeciesID -> Kingdom lookup taken from the labelled training rows.
# Rows with a missing SpeciesID or Kingdom are dropped, and exactly one row is
# kept per SpeciesID, so the left-merge onto the test set can never duplicate
# a test row.
train_id_kingdom = (
    train_data[['SpeciesID', 'Kingdom']]
    .dropna()
    .drop_duplicates(subset='SpeciesID')
)

In [None]:
# Attach the kingdom known from training (suffix _y) next to the model's
# prediction (suffix _x) for every test row.
final_pred = pd.merge(test_data, train_id_kingdom, how='left', on='SpeciesID')

In [None]:
# Use the kingdom from the training lookup where there is one, otherwise
# fall back to the model's prediction.
known_kingdom = final_pred['Kingdom_y']
predicted_kingdom = final_pred['Kingdom_x']
final_pred['Kingdom'] = known_kingdom.combine_first(predicted_kingdom)

In [None]:
# Write the submission file: one row per test id with its final Kingdom,
# without the DataFrame index.
final_pred[['id', 'Kingdom']].to_csv('Predictions.csv', index=False)

In [None]:
train_id_kingdom[train_id_kingdom['SpeciesID'] == 5888]

In [None]:
# Test rows whose SpeciesID is known from training but where the model's
# prediction disagrees with the known kingdom.
has_known_kingdom = final_pred['Kingdom_y'].notna()
model_disagrees = final_pred['Kingdom_x'] != final_pred['Kingdom_y']
final_pred[model_disagrees & has_known_kingdom]