The task is to predict the condition of a motor based on the features provided.

This is ultimately a supervised learning task, but we will add PCA to our pre-processing step.

The goal here is to get the highest overall accuracy.

In [35]:
# Mount Google Drive inside the Colab runtime so files under MyDrive can be read.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import pandas as pd
from seaborn import heatmap
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [37]:
# Path of the raw Excel workbook on the mounted Google Drive.
data = '/content/drive/MyDrive/Data/Sensorless_RAW.xlsx'

# Read the workbook into a DataFrame using the pandas defaults.
df = pd.read_excel(io=data)


In [38]:
# First look at the data: preview the leading rows, then list each
# column's dtype and non-null count.
preview = df.head()
display(preview)
print('\n\n')  # visual gap between the preview table and the info summary
df.info()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49
0,-3.0146e-07,8.2603e-06,-1.2e-05,-2e-06,-1.4386e-06,-2.1e-05,0.031718,0.03171,0.031721,-0.032963,...,-0.63308,2.9646,8.1198,-1.4961,-1.4961,-1.4961,-1.4996,-1.4996,-1.4996,1
1,2.9132e-06,-5.2477e-06,3e-06,-6e-06,2.7789e-06,-4e-06,0.030804,0.03081,0.030806,-0.03352,...,-0.59314,7.6252,6.169,-1.4967,-1.4967,-1.4967,-1.5005,-1.5005,-1.5005,1
2,-2.9517e-06,-3.184e-06,-1.6e-05,-1e-06,-1.5753e-06,1.7e-05,0.032877,0.03288,0.032896,-0.029834,...,-0.63252,2.7784,5.3017,-1.4983,-1.4983,-1.4982,-1.4985,-1.4985,-1.4985,1
3,-1.3226e-06,8.8201e-06,-1.6e-05,-5e-06,-7.2829e-07,4e-06,0.02941,0.029401,0.029417,-0.030156,...,-0.62289,6.5534,6.2606,-1.4963,-1.4963,-1.4963,-1.4975,-1.4975,-1.4976,1
4,-6.8366e-08,5.6663e-07,-2.6e-05,-6e-06,-7.9406e-07,1.3e-05,0.030119,0.030119,0.030145,-0.031393,...,-0.6301,4.5155,9.5231,-1.4958,-1.4958,-1.4958,-1.4959,-1.4959,-1.4959,1





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58509 entries, 0 to 58508
Data columns (total 49 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column1   58509 non-null  float64
 1   Column2   58509 non-null  float64
 2   Column3   58509 non-null  float64
 3   Column4   58509 non-null  float64
 4   Column5   58509 non-null  float64
 5   Column6   58509 non-null  float64
 6   Column7   58509 non-null  float64
 7   Column8   58509 non-null  float64
 8   Column9   58509 non-null  float64
 9   Column10  58509 non-null  float64
 10  Column11  58509 non-null  float64
 11  Column12  58509 non-null  float64
 12  Column13  58509 non-null  float64
 13  Column14  58509 non-null  float64
 14  Column15  58509 non-null  float64
 15  Column16  58509 non-null  float64
 16  Column17  58509 non-null  float64
 17  Column18  58509 non-null  float64
 18  Column19  58509 non-null  float64
 19  Column20  58509 non-null  float64
 20  Column21  58509 non-null 

In [39]:
# Summary statistics for every column. Extreme values should not be
# discarded blindly here: outliers may be indicators of defects.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49
count,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,...,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0,58509.0
mean,-3e-06,1.439648e-06,1.412013e-06,-1e-06,1.351239e-06,-2.654483e-07,0.001915,0.001913,0.001912,-0.011897,...,-0.397757,7.293781,8.273772,-1.500887,-1.500912,-1.500805,-1.497771,-1.497794,-1.497686,6.0
std,7.2e-05,5.555429e-05,0.0002353009,6.3e-05,5.660943e-05,0.0002261907,0.036468,0.036465,0.03647,0.066482,...,25.018728,12.451781,6.565952,0.003657,0.003668,0.003632,0.003163,0.003163,0.003175,3.162305
min,-0.013721,-0.0054144,-0.01358,-0.012787,-0.0083559,-0.0097413,-0.13989,-0.13594,-0.13086,-0.21864,...,-0.90235,-0.59683,0.32066,-1.5255,-1.5262,-1.5237,-1.5214,-1.5232,-1.5213,1.0
25%,-7e-06,-1.4444e-05,-7.2396e-05,-5e-06,-1.4753e-05,-7.3791e-05,-0.019927,-0.019951,-0.019925,-0.032144,...,-0.71547,1.4503,4.4363,-1.5033,-1.5034,-1.5032,-1.4996,-1.4996,-1.4995,3.0
50%,-3e-06,8.8046e-07,5.1377e-07,-1e-06,7.5402e-07,-1.6593e-07,0.013226,0.01323,0.013247,-0.015566,...,-0.66171,3.3013,6.4791,-1.5003,-1.5003,-1.5003,-1.4981,-1.4981,-1.498,6.0
75%,2e-06,1.8777e-05,7.52e-05,4e-06,1.9062e-05,7.1386e-05,0.02477,0.024776,0.024777,0.020614,...,-0.57398,8.2885,9.8575,-1.4982,-1.4982,-1.4982,-1.4962,-1.4963,-1.4962,9.0
max,0.005784,0.0045253,0.0052377,0.001453,0.00082451,0.0027536,0.069125,0.06913,0.069131,0.35258,...,3670.8,889.93,153.15,-1.4576,-1.4561,-1.4555,-1.3372,-1.3372,-1.3371,11.0


In [40]:
# Count the rows belonging to each class of the target column to check balance.
target_counts = df['Column49'].value_counts()
target_counts

1     5319
2     5319
3     5319
4     5319
5     5319
6     5319
7     5319
8     5319
9     5319
10    5319
11    5319
Name: Column49, dtype: int64

In [41]:
# Separate the label column (y) from the remaining feature columns (X).
y = df['Column49']
X = df.drop(columns=['Column49'])

In [42]:
# Hold out part of the data for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=47)
X_train, X_test, y_train, y_test = split_parts

In [43]:
# Preprocessing steps: standardise each feature, then keep the number of
# principal components needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
scaler = StandardScaler()

# Chain the two steps so they are fitted and applied together.
preprocess = make_pipeline(scaler, pca)

In [44]:
# Fit the preprocessing pipeline on the training features and inspect how
# many columns remain after PCA.
pca_train = preprocess.fit_transform(X=X_train)

pca_train.shape

(43881, 20)

We were able to reduce the 48 feature columns to only 20 principal components while retaining at least 95% of the variance (information) of the original, standardized features.

# Without PCA


In [45]:
# Instantiate the Random Forest. A fixed random_state makes the reported
# scores reproducible from run to run (same seed as the train/test split).
rf = RandomForestClassifier(random_state=47)

# Baseline pipeline without PCA: scale, then classify.
# It is built from its own scaler and forest objects on purpose:
# make_pipeline keeps references to the estimators it receives instead of
# copying them, so reusing `scaler`/`rf` in a second pipeline would let a
# later fit of that pipeline overwrite this pipeline's fitted state.
rf_nopca = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=47),
)

In [46]:
%%time 
rf_nopca.fit(X_train, y_train)

print(f'Random Forest Training without PCA: {rf_nopca.score(X_train, y_train):.3f}')
print(f'Random Forest Testing without PCA: {rf_nopca.score(X_test, y_test):.3f}')

Random Forest Training without PCA: 1.000
Random Forest Testing without PCA: 0.998
CPU times: user 28.5 s, sys: 92.9 ms, total: 28.6 s
Wall time: 28.5 s


# With PCA

In [47]:
# Pipeline with PCA: (scale -> PCA keeping 95% of the variance) -> Random Forest.
# Fresh estimator objects are created here rather than reusing the ones
# already placed in another pipeline: make_pipeline does not copy its steps,
# so fitting this pipeline with a shared forest would refit that forest on
# the PCA output and invalidate the pipeline fitted without PCA.
# random_state pins the forest so the reported scores are reproducible.
rf_pca = make_pipeline(
    make_pipeline(StandardScaler(), PCA(n_components=.95)),
    RandomForestClassifier(random_state=47),
)

In [48]:
%%time
rf_pca.fit(X_train, y_train)

print(f'Random Forest Training with PCA: {rf_pca.score(X_train, y_train):.3f}')
print(f'Random Forest Testing with PCA: {rf_pca.score(X_test, y_test):.3f}')

Random Forest Training with PCA: 1.000
Random Forest Testing with PCA: 0.936
CPU times: user 26.2 s, sys: 509 ms, total: 26.7 s
Wall time: 26.2 s


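With PCA, the test accuracy dropped from 0.998 to 0.936 while training time improved only slightly (about 26 s versus 28 s), so for this dataset the model without PCA is the better choice when the goal is the highest accuracy. These are just default models, but we could try a grid search or other classifier algorithms to reduce overfitting and see if we could get a better result!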