# Kaggle Classification


## Libraries

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.exceptions import ConvergenceWarning
import warnings

from scipy.stats import multivariate_normal

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer,accuracy_score ,precision_score, recall_score
from sklearn.metrics import f1_score,confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline


from functions import *

from utility import read_all_csvs_one_test
from utility import read_all_test_data_from_path
from utility import run_cv_one_motor
from utility import extract_selected_feature, prepare_sliding_window

### Read data and pre-process

In [21]:
n_int = 20

utility_path = '../'
sys.path.insert(1, utility_path)

# Subfunction for data preprocessing.
def pre_processing(df: pd.DataFrame):
    ''' ### Description
    Preprocess the data:
    - remove outliers
    - add new features about the difference between the current and previous n data point.
    - Use ffil function to replace the invalid measurement with the previous value
    '''
    
    def remove_outliers(df: pd.DataFrame):
        ''' # Description
        Remove outliers from the dataframe based on defined valid ranges. 
        Define a valid range of temperature and voltage. 
        Use ffil function to replace the invalid measurement with the previous value.
        '''
        df['temperature'] = df['temperature'].where(df['temperature'] <= 100, np.nan)
        df['temperature'] = df['temperature'].where(df['temperature'] >= 0, np.nan)
        df['temperature'] = df['temperature'].ffill()        

        df['voltage'] = df['voltage'].where(df['voltage'] >= 6000, np.nan)
        df['voltage'] = df['voltage'].where(df['voltage'] <= 9000, np.nan)
        df['voltage'] = df['voltage'].ffill()       

        df['position'] = df['position'].where(df['position'] >= 0, np.nan)
        df['position'] = df['position'].where(df['position'] <= 1000, np.nan)
        df['position'] = df['position'].ffill()
        
    def remove_seq_variability(df: pd.DataFrame):
        ''' # Description
        Remove the sequence-to-sequence variability.
        '''
        # Tranform the features relative to the first data point.
        df['temperature'] = df['temperature'] - df['temperature'].iloc[0]
        df['voltage'] = df['voltage'] - df['voltage'].iloc[0]
        df['position'] = df['position'] - df['position'].iloc[0]

    def cal_diff(df: pd.DataFrame, n_int: int):
        ''' # Description
        Calculate the difference between the current and previous n data point.
        '''

        # Calculate the difference between the current and previous n data point.
        df['temperature_diff'] = df['temperature'].diff(n_int)
        df['voltage_diff'] = df['voltage'].diff(n_int)
        df['position_diff'] = df['position'].diff(n_int)   

    # Start processing.
    remove_outliers(df)
    remove_seq_variability(df)
    #cal_diff(df, n_int)
    
label_columns = ['data_motor_1_label', 'data_motor_2_label', 'data_motor_3_label', 'data_motor_4_label', 'data_motor_5_label', 'data_motor_6_label']
    
# Read all the training dataset.
base_dictionary = '../../dataset/training_data/'
df_data = read_all_test_data_from_path(base_dictionary, pre_processing, is_plot=False)

# Smooth the data.
smoothed_data = df_data.copy(deep=True)

for i in range(1,7):
    smoothed_data[f'data_motor_{i}_voltage'] = smooth_data_moving_average(smoothed_data[f'data_motor_{i}_voltage'], 10)

Read Test data

In [22]:
base_dictionary = '../../dataset/testing_data/'
df_test = read_all_test_data_from_path(base_dictionary, pre_processing, is_plot=False)


for i in range(1,7):
    df_test[f'data_motor_{i}_voltage'] = smooth_data_moving_average(df_test[f'data_motor_{i}_voltage'], 10)

Only uncomment the following code if you want to add the call_diff function to preprocessing : 

In [None]:
'''
base_dictionary = '../../dataset/testing_data/'
# Get all the folders in the base_dictionary
path_list = os.listdir(base_dictionary)
# Only keep the folders, not the excel file.
path_list_sorted = sorted(path_list)
path_list = path_list_sorted[:-1]

# Read the data.
df_test = pd.DataFrame()
for tmp_path in path_list:
    path = base_dictionary + tmp_path
    #tmp_df = read_all_csvs_one_test(path, tmp_path, pre_processing)
    
    ### ------------read_all_csvs_one_test --------------
    
    # Get a list of all CSV files in the folder
    csv_files = [file for file in os.listdir(path) if file.endswith('.csv')]

    # Create an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    # Iterate over the CSV files in the folder
    for file in csv_files:
        # Construct the full path to each CSV file
        file_path = os.path.join(path, file)

        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Drop the time. Will add later.
        df = df.drop(labels=df.columns[0], axis=1)

        # Apply the pre-processing.
        if pre_processing:
            pre_processing(df)

        # Extract the file name (excluding the extension) to use as a prefix
        file_name = os.path.splitext(file)[0]

        # Add a prefix to each column based on the file name
        df = df.add_prefix(f'{file_name}_')

        # Concatenate the current DataFrame with the combined DataFrame
        combined_df = pd.concat([combined_df, df], axis=1)

    # Add time and test condition
    df = pd.read_csv(file_path)
    combined_df = pd.concat([df['time'], combined_df], axis=1)

    # Calculate the time difference since the first row
    time_since_first_row = combined_df['time'] - combined_df['time'].iloc[0]
    # Replace the 'time' column with the time difference
    combined_df['time'] = time_since_first_row

    combined_df.loc[:, 'test_condition'] = tmp_path

    combined_df.drop(columns=label_columns, inplace= True)
    
    # Drop the NaN values, which represents the first n data points in the original dataframe.
    #combined_df.dropna(inplace=True)

    tmp_df = combined_df
    
    ### --------------------------------------------
    
    df_test = pd.concat([df_test, tmp_df])
    df_test = df_test.reset_index(drop=True)

# Read the test conditions
df_test_conditions = pd.read_excel(base_dictionary+'Test conditions.xlsx')

# Smooth the data.

#df_test.drop(columns=['time','test_condition'], inplace=True)

for i in range(1,7):
    df_test[f'data_motor_{i}_voltage'] = smooth_data_moving_average(df_test[f'data_motor_{i}_voltage'], 10)

'''

Specify the test conditions you would like to include in the training.

In [23]:
#df_data_experiment = smoothed_data[smoothed_data['test_condition'].isin(['20240425_093699', '20240425_094425', '20240426_140055',
                                                       #'20240503_164675', '20240503_165189',
                                                       #'20240503_163963', '20240325_155003'])]


# normal
normal_test_id = ['20240105_164214',
    '20240105_165300',
    '20240105_165972',
    '20240320_152031',
    '20240320_153841',
    '20240320_155664',
    '20240321_122650',
    '20240325_135213',
    '20240325_152902',
    '20240426_141190',
    '20240426_141532',
    '20240426_141602',
    '20240426_141726',
    '20240426_141938',
    '20240426_141980',
    '20240503_163963',
    '20240503_164435',
    '20240503_164675',
    '20240503_165189']

normal_test_id = ['20240105_164214',
    '20240105_165300',
    '20240105_165972',
    '20240320_152031',
    '20240320_153841',
    '20240320_155664',
    '20240321_122650',
    '20240325_135213',
    '20240325_152902',
    '20240325_155003',
    '20240425_093699',
    '20240425_094425',
    '20240426_140055',
    '20240426_141190',
    '20240426_141532',
    '20240426_141602',
    '20240426_141726',
    '20240426_141938',
    '20240426_141980',
    '20240503_163963',
    '20240503_164435',
    '20240503_164675',
    '20240503_165189',
    '20240529_122361',
    '20240529_122994',
    '20240529_123223',
    '20240529_123430',
    '20240529_124333',
    '20240529_125896',
    '20240529_130680',
    '20240529_131085',
    '20240529_131373',
    '20240529_131558',
    '20240529_131755',
    '20240529_132509',
    '20240529_133879'
]


df_data_experiment = smoothed_data[smoothed_data['test_condition'].isin(normal_test_id)]

### Cross validation - Only for evaluation, not for submit Prediction

In [10]:
def run_all_motors(motor_label, drop_list):
    X = df_data_experiment.drop(columns=label_columns+drop_list)
    y = df_data_experiment[motor_label]
    
    #X_train, y_train, X_test , y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    warnings.filterwarnings('ignore')

    # Initialize models
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced'),
        'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
        'Random Forest': RandomForestClassifier(class_weight='balanced'),
        'Support Vector Machine': SVC(class_weight='balanced'),
        'Gradient Boosting': GradientBoostingClassifier()
    }

    # Dictionary to store model performance metrics
    model_metrics = {}

    # Define hyperparameter grids
    param_grids = {
        'Logistic Regression': {'C': [0.1, 1, 10]},
        'Decision Tree': {'max_depth': [None, 10, 20]},
        'Random Forest': {'n_estimators': [50, 100, 200]},
        'Support Vector Machine': {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
        'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
    }

    model_predictions = {}

    # Perform cross-validation, hyperparameter tuning, and evaluation
    for model_name, model in models.items():
        
        # Hyperparameter tuning
        grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='f1')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        
        # Evaluate on test set
        y_pred = best_model.predict(X_test)
        
        model_predictions[f'y_pred_{model_name.replace(" ", "_")}'] = y_pred
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Store metrics in the dictionary
        model_metrics[model_name] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1
        }
        
    # Update the summary table with the model performance metrics
    summary_table = "| Model                    | Accuracy | Precision | Recall | F1    |\n"
    summary_table += "|--------------------------|----------|-----------|--------|-------|\n"
    for model_name, metrics in model_metrics.items():
        summary_table += f"| {model_name:25} | {metrics['Accuracy']*100:.2f}%   | {metrics['Precision']*100:.2f}%   | {metrics['Recall']*100:.2f}%  | {metrics['F1']*100:.2f}% |\n"

    print(summary_table)        
    
    return model_predictions

### Apply model on testing data for submit Prediction

In [27]:
def run_all_motors_validation(motor_label, drop_list):
        
    feature_list_all = df_data_experiment.drop(columns=drop_list + ["test_condition"] + label_columns).columns.tolist()

    # Extract the features.
    df_tr_x, df_tr_y = extract_selected_feature(df_data_experiment, feature_list_all, motor_label, mdl_type='clf')

    # Prepare the training data based on the defined sliding window.
    window_size = 70
    sample_step = 40
    X_train, y_train = prepare_sliding_window(df_x=df_tr_x, y=df_tr_y, window_size=window_size, sample_step=sample_step, mdl_type='clf')
    
    # Define the classification model.
    
    warnings.filterwarnings('ignore')

    # Initialize models
    #models = {
    #    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    #    'Decision Tree': DecisionTreeClassifier(class_weight='balanced'),
    #    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    #    'Support Vector Machine': SVC(class_weight='balanced'),
    #    'Gradient Boosting': GradientBoostingClassifier()
    #}
    
    # Initialize models
    models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced'),
        'Support Vector Machine': SVC(class_weight='balanced'),
    }
    

    # Define hyperparameter grids
    #param_grids = {
    #    'Logistic Regression': {'C': [0.1, 1, 10]},
    #    'Decision Tree': {'max_depth': [20, 30]},
    #    'Random Forest': {'n_estimators': [150]},
    #    'Support Vector Machine': {'C': [0.1, 1, 10], 'gamma': ['scale']},
    #    'Gradient Boosting': {'n_estimators': [100, 150], 'learning_rate': [0.1, 0.5, 1]}
    #}
    
    # Define hyperparameter grids
    param_grids = {
        'Support Vector Machine': {'C': [0.01], 'gamma': ['scale']},
        'Logistic Regression': {'C': [0.01]}
    }

    model_predictions = {}
    best_params = {}
    
    # Perform cross-validation, hyperparameter tuning, and evaluation
    for model_name, model in models.items():
        
        # Create the pipeline
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()), # Step 1 : Normalization
            ('model', model)
        ])
        
        param_grid = {f'model__{key}': value for key, value in param_grids[model_name].items()}
        
        # Hyperparameter tuning
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
        #grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='f1')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params[model_name] = grid_search.best_params_
        
        ## Evaluate on test set
        
        feature_list_all = df_data_experiment.drop(columns=label_columns+drop_list).columns.tolist()
        
        df_test_x = df_test[feature_list_all]
    
        X_test = prepare_sliding_window(df_x=df_test_x, window_size=window_size, sample_step=sample_step, mdl_type='clf')
    
        y_pred = best_model.predict(X_test)
        
        model_predictions[f'y_pred_{model_name.replace(" ", "_")}'] = y_pred    
    
    return model_predictions, best_params

# Motor 1

Feature Selection

In [28]:
drop_list1_label1 = ['data_motor_2_voltage', 'data_motor_3_voltage', 'data_motor_4_voltage', 'data_motor_5_voltage', 'data_motor_6_voltage', 'data_motor_6_position']

drop_list2_label1 = []

Evaluation

In [12]:
model_predictions = run_all_motors('data_motor_1_label',drop_list1_label1)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 87.42%   | 0.00%   | 0.00%  | 0.00% |
| Decision Tree             | 99.82%   | 99.64%   | 98.92%  | 99.28% |
| Random Forest             | 99.82%   | 100.00%   | 98.56%  | 99.27% |
| Support Vector Machine    | 87.42%   | 0.00%   | 0.00%  | 0.00% |
| Gradient Boosting         | 99.77%   | 99.64%   | 98.56%  | 99.09% |



Prediction on testing data

In [29]:
model_predictions, best_params1 = run_all_motors_validation(1, drop_list1_label1)
#model_predictions, best_params1 = run_all_motors_validation(1, drop_list2_label1)

#y_pred1_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting']
y_pred1_Logistic_Regression = model_predictions['y_pred_Logistic_Regression']
#y_pred1_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred1_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred1_SVC= model_predictions['y_pred_Support_Vector_Machine']

In [10]:
best_params1

{'Logistic Regression': {'model__C': 0.01},
 'Support Vector Machine': {'model__C': 0.01, 'model__gamma': 'scale'}}

# Motor 2

Feature Selection

In [30]:
drop_list1_label2 = ['data_motor_1_voltage', 'data_motor_3_voltage', 'data_motor_4_voltage', 'data_motor_5_voltage', 'data_motor_6_voltage', 'data_motor_5_temperature']

drop_list2_label2 = []

Evaluation

In [None]:
model_predictions = run_all_motors('data_motor_2_label',drop_list1_label2)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 89.33%   | 24.98%   | 96.81%  | 39.71% |
| Decision Tree             | 99.23%   | 82.84%   | 99.29%  | 90.32% |
| Random Forest             | 99.86%   | 96.89%   | 99.29%  | 98.07% |
| Support Vector Machine    | 95.28%   | 43.22%   | 96.10%  | 59.63% |
| Gradient Boosting         | 99.68%   | 92.13%   | 99.65%  | 95.74% |



Prediction on testing data

In [31]:
model_predictions, best_params2 = run_all_motors_validation(2, drop_list1_label2)
#model_predictions, best_params2 = run_all_motors_validation(2, drop_list2_label2)

#y_pred2_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting'] # a minimum of 2 classes are required.
y_pred2_Logistic_Regression = model_predictions['y_pred_Logistic_Regression'] # The number of classes has to be greater than one; got 1 class
#y_pred2_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred2_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred2_SVC= model_predictions['y_pred_Support_Vector_Machine'] # The number of classes has to be greater than one; got 1 class

In [None]:
best_params2

{'Decision Tree': {'model__max_depth': 20},
 'Random Forest': {'model__min_samples_split': 4, 'model__n_estimators': 100},
 'Gradient Boosting': {'model__learning_rate': 0.1,
  'model__n_estimators': 300}}

# Motor 3

Feature Selection

In [32]:
drop_list1_label3 = ['data_motor_1_voltage','data_motor_2_voltage', 'data_motor_4_voltage', 'data_motor_5_voltage', 'data_motor_6_voltage']
drop_list2_label3 = []

Evaluation

In [None]:
model_predictions = run_all_motors('data_motor_3_label',drop_list1_label3)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 85.34%   | 19.50%   | 97.16%  | 32.48% |
| Decision Tree             | 99.46%   | 87.74%   | 98.94%  | 93.00% |
| Random Forest             | 99.88%   | 97.56%   | 99.29%  | 98.42% |
| Support Vector Machine    | 95.46%   | 44.21%   | 96.10%  | 60.56% |
| Gradient Boosting         | 99.83%   | 96.22%   | 99.29%  | 97.73% |



Prediction on testing data

In [33]:
model_predictions, best_params3 = run_all_motors_validation(3, drop_list1_label3)
#model_predictions, best_params3 = run_all_motors_validation(3, drop_list2_label3)

#y_pred3_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting']
y_pred3_Logistic_Regression = model_predictions['y_pred_Logistic_Regression']
#y_pred3_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred3_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred3_SVC= model_predictions['y_pred_Support_Vector_Machine']

In [None]:
best_params3

{'Decision Tree': {'model__max_depth': 20},
 'Random Forest': {'model__min_samples_split': 2, 'model__n_estimators': 100},
 'Gradient Boosting': {'model__learning_rate': 0.5,
  'model__n_estimators': 300}}

# Motor 4

Feature selection

In [34]:
drop_list1_label4= ['data_motor_1_voltage', 'data_motor_2_voltage', 'data_motor_3_voltage', 'data_motor_5_voltage', 'data_motor_6_voltage', 'data_motor_5_temperature']
drop_list2_label4 = []

Evaluation

In [None]:
model_predictions = run_all_motors('data_motor_4_label',drop_list1_label4)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 85.56%   | 19.57%   | 95.74%  | 32.49% |
| Decision Tree             | 99.21%   | 82.79%   | 98.94%  | 90.15% |
| Random Forest             | 99.90%   | 97.90%   | 99.29%  | 98.59% |
| Support Vector Machine    | 95.64%   | 45.21%   | 95.39%  | 61.35% |
| Gradient Boosting         | 99.88%   | 97.56%   | 99.29%  | 98.42% |



Prediction on testing data

In [35]:
model_predictions,best_params4 = run_all_motors_validation(4, drop_list1_label4)
#model_predictions,best_params4 = run_all_motors_validation(4, drop_list2_label4)

#y_pred4_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting']
y_pred4_Logistic_Regression = model_predictions['y_pred_Logistic_Regression']
#y_pred4_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred4_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred4_SVC= model_predictions['y_pred_Support_Vector_Machine']

In [None]:
best_params4

{'Decision Tree': {'model__max_depth': 30},
 'Random Forest': {'model__n_estimators': 150}}

# Motor 5

Feature selection

In [36]:
drop_list1_label5 = ['data_motor_1_voltage','data_motor_2_voltage', 'data_motor_3_voltage', 'data_motor_4_voltage', 'data_motor_6_voltage']
drop_list2_label5 = []

Evaluation

In [None]:
model_predictions = run_all_motors('data_motor_5_label',drop_list1_label5)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 89.52%   | 25.46%   | 97.87%  | 40.41% |
| Decision Tree             | 99.11%   | 81.05%   | 98.58%  | 88.96% |
| Random Forest             | 99.83%   | 96.22%   | 99.29%  | 97.73% |
| Support Vector Machine    | 95.17%   | 42.70%   | 96.45%  | 59.19% |
| Gradient Boosting         | 99.34%   | 84.89%   | 99.65%  | 91.68% |



Prediction on testing data

In [37]:
model_predictions, best_params5 = run_all_motors_validation(5, drop_list1_label5)
#model_predictions, best_params5 = run_all_motors_validation(5, drop_list2_label5)

#y_pred5_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting']
y_pred5_Logistic_Regression = model_predictions['y_pred_Logistic_Regression']
#y_pred5_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred5_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred5_SVC= model_predictions['y_pred_Support_Vector_Machine']

In [None]:
best_params5

{'Decision Tree': {'model__max_depth': 20},
 'Random Forest': {'model__n_estimators': 150}}

# Motor 6

Feature selection


In [38]:
drop_list1_label6 = ['data_motor_1_voltage','data_motor_2_voltage', 'data_motor_3_voltage', 'data_motor_4_voltage', 'data_motor_5_voltage', 'data_motor_6_position']
drop_list2_label6 = []


Evaluation

In [None]:
model_predictions = run_all_motors('data_motor_6_label',drop_list1_label6)

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 83.50%   | 17.66%   | 96.81%  | 29.87% |
| Decision Tree             | 99.42%   | 86.69%   | 99.29%  | 92.56% |
| Random Forest             | 99.86%   | 97.21%   | 98.94%  | 98.07% |
| Support Vector Machine    | 95.83%   | 46.40%   | 96.10%  | 62.59% |
| Gradient Boosting         | 99.87%   | 96.90%   | 99.65%  | 98.25% |



Prediction on testing data

In [39]:
model_predictions, best_params6 = run_all_motors_validation(6, drop_list1_label6)
#model_predictions, best_params6 = run_all_motors_validation(6, drop_list2_label6)

#y_pred6_Gradient_Boosting = model_predictions['y_pred_Gradient_Boosting']
y_pred6_Logistic_Regression = model_predictions['y_pred_Logistic_Regression']
#y_pred6_Decision_Tree = model_predictions['y_pred_Decision_Tree']
#y_pred6_Random_Forest= model_predictions['y_pred_Random_Forest']
y_pred6_SVC= model_predictions['y_pred_Support_Vector_Machine']

In [None]:
best_params6

{'Decision Tree': {'model__max_depth': 30},
 'Random Forest': {'model__n_estimators': 150}}

## Create csv file for submit Prediction

Logistic_Regression

In [40]:
data_Logistic_Regression = {
    'idx': range(len(y_pred1_Logistic_Regression)),
    'data_motor_1_label': y_pred1_Logistic_Regression,
    'data_motor_2_label': y_pred2_Logistic_Regression,
    'data_motor_3_label': y_pred3_Logistic_Regression,
    'data_motor_4_label': y_pred4_Logistic_Regression,
    'data_motor_5_label': y_pred5_Logistic_Regression,
    'data_motor_6_label': y_pred6_Logistic_Regression
}

df_Logistic_Regression = pd.DataFrame(data_Logistic_Regression)
df_Logistic_Regression.to_csv('motor_predictions_Logistic_Regression.csv', index=False)

Support Vector Machine

In [41]:
data_SVC = {
    'idx': range(len(y_pred1_SVC)),
    'data_motor_1_label': y_pred1_SVC,
    'data_motor_2_label': y_pred2_SVC,
    'data_motor_3_label': y_pred3_SVC,
    'data_motor_4_label': y_pred4_SVC,
    'data_motor_5_label': y_pred5_SVC,
    'data_motor_6_label': y_pred6_SVC
}

df_SVC = pd.DataFrame(data_SVC)

df_SVC.to_csv('motor_predictions_SVC.csv', index=False)

Gradient_Boosting

In [152]:
data_Gradient_Boosting = {
    'idx': range(len(y_pred1_Gradient_Boosting)),
    'data_motor_1_label': y_pred1_Gradient_Boosting,
    'data_motor_2_label': y_pred2_Gradient_Boosting,
    'data_motor_3_label': y_pred3_Gradient_Boosting,
    'data_motor_4_label': y_pred4_Gradient_Boosting,
    'data_motor_5_label': y_pred5_Gradient_Boosting,
    'data_motor_6_label': y_pred6_Gradient_Boosting
}

df_Gradient_Boosting = pd.DataFrame(data_Gradient_Boosting)

df_Gradient_Boosting.to_csv('motor_predictions_Gradient_Boosting.csv', index=False)

Random_Forest

In [153]:
data_Random_Forest = {
    'idx': range(len(y_pred1_Random_Forest)),
    'data_motor_1_label': y_pred1_Random_Forest,
    'data_motor_2_label': y_pred2_Random_Forest,
    'data_motor_3_label': y_pred2_Random_Forest,
    'data_motor_4_label': y_pred4_Random_Forest,
    'data_motor_5_label': y_pred5_Random_Forest,
    'data_motor_6_label': y_pred6_Random_Forest
}

df_Random_Forest = pd.DataFrame(data_Random_Forest)

df_Random_Forest.to_csv('motor_predictions_Random_Forest.csv', index=False)

Decision_Tree

In [154]:
data_Decision_Tree = {
    'idx': range(len(y_pred1_Decision_Tree)),
    'data_motor_1_label': y_pred1_Decision_Tree,
    'data_motor_2_label': y_pred2_Decision_Tree,
    'data_motor_3_label': y_pred3_Decision_Tree,
    'data_motor_4_label': y_pred4_Decision_Tree,
    'data_motor_5_label': y_pred5_Decision_Tree,
    'data_motor_6_label': y_pred6_Decision_Tree
}

df_Decision_Tree = pd.DataFrame(data_Decision_Tree)

df_Decision_Tree.to_csv('motor_predictions_Decision_Tree.csv', index=False)

## Summary of the best results

| Model   | Drop_list | window_size | sample_step | F1   |
|---------|----------|-------------|-------------|------|
| Logistic Regression | 2 | 70 | 30 | 0.37|
| SVC | 2 | 70 | 30 | 0.45|
| Random Forest | 2 | 70 | 30 | 0.13 | 
| Decision Tree  |  2 | 70 | 30 |  0.19 | 
| Gradient Boosting  |  2 | 70 | 30 | 0.12 | 