In [None]:
%pip install --upgrade --quiet neptune-client

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import neptune.new as neptune
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [14]:
#Load datasets
def load_data() -> list:
    X_train_ = pd.read_csv('train_data.csv', header=None)
    y_labels = pd.read_csv('train_labels.csv', header=None)
    X_test_ = pd.read_csv('test_data.csv', header=None)
    return[X_train_, y_labels, X_test_]
X, y, X_test_data = load_data()

In [15]:
#Split training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
#Run neptune.ai
run = neptune.init(project='ml_cdv/predict-labels',
                   api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxYWJkNjhjMS04MzYyLTQ2ZDktOGYwZS03NDFhOWM0MjUzYjIifQ==') # your credentials
run

Your trial will end in 1 day. If you want to continue tracking your team experiments in Neptune, upgrade your plan (https://app.neptune.ai/o/ml_cdv/-/subscription)
psutil is not installed. Hardware metrics will not be collected.
https://app.neptune.ai/ml_cdv/predict-labels/e/PRED-4
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


<neptune.new.run.Run at 0x7f50c2223c90>

In [17]:
#Function with model, score and confusion matrix
def model(x_1, x_2, y_1, y_2, save: bool = False):

    pipe = Pipeline([('pca', PCA(n_components = 0.95)), ('scaler', MinMaxScaler()),('classifier', SVC())])

    params = [
        {'scaler': [MinMaxScaler()],
        'pca': [PCA(n_components = 0.95)]},
        {"classifier": [LogisticRegression(random_state=42)],
        "classifier__penalty": ["l2"],
        "classifier__C": np.logspace(0.001, 0.1, 10),
        'classifier__class_weight': ['balanced'],
        "classifier__solver": ["liblinear"]
        },

        {'classifier': [SVC(random_state=42)],
        'classifier__kernel': ['linear', 'poly'],
        'classifier__class_weight': ['balanced'],
        'classifier__C': np.logspace(1,2,5)},
        
        {"classifier": [RandomForestClassifier(random_state=42)],
        "classifier__n_estimators": [100, 120, 300, 500, 800, 1200],
        "classifier__max_features": ['log2', 'sqrt', 'auto', None],
        "classifier__max_depth": [5, 8, 15, 25, 30, None],
        "classifier__min_samples_split": [1,2,5,10,15,100],
        "classifier__min_samples_leaf": [1,2,5,10]
        }]

    randsearch = RandomizedSearchCV(pipe,
                              params,
                              cv=2,
                              verbose=1,
                              n_jobs=-1,
                              scoring='f1_micro')
    
    best_model = randsearch.fit(x_2, y_2.values.ravel())
    y_pred = best_model.predict(x_2)
    print(f"\nBest model params: \n{best_model.best_params_}")
    print(f"\nModel scorer: \n{best_model.scorer_}")
    print(f"\nModel score: \n{best_model.best_score_}")
    print(confusion_matrix(y_2, y_pred))

    if save:
        filename = "mf_model.pkl"
        joblib.dump(best_model, filename)

In [18]:
#Run predicting function and save file with model
y_pred = model(X_train, X_test, y_train, y_test, save=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
 0.9006462         nan 0.9006462         nan]

Best model params: 
{'classifier__n_estimators': 500, 'classifier__min_samples_split': 15, 'classifier__min_samples_leaf': 10, 'classifier__max_features': None, 'classifier__max_depth': 30, 'classifier': RandomForestClassifier(max_depth=30, max_features=None, min_samples_leaf=10,
                       min_samples_split=15, n_estimators=500, random_state=42)}

Model scorer: 
make_scorer(f1_score, pos_label=None, average=micro)

Model score: 
0.9200323101777059
[[ 106   17]
 [  15 1100]]


In [19]:
#Stop neptune.ai
run.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!


In [39]:
#Load file with saved model
def load_model(pred):
    
    filename = "mf_model.pkl"
    loaded_model = joblib.load(filename)

    labels = loaded_model.predict(pred)
    
    return labels

In [40]:
#Predict labels with testing data
test_labels = load_model(X_test_data)

In [42]:
#Save dataframe with predicted labels to csv file
test_labels_df = pd.DataFrame(test_labels)
test_labels_df.to_csv("test_labels.csv")

In [43]:
#Read dataframe with predicted labels
test_labels_df

Unnamed: 0,0
0,-1
1,1
2,1
3,1
4,-1
...,...
1245,1
1246,1
1247,1
1248,1


In [44]:
#Count values of dataframe with predicted labels
test_labels_df.value_counts()

 1    1096
-1     154
dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ae6d556f-9829-4201-b0d0-4c143a3266cf' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>