In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import pickle
from pandas_profiling import ProfileReport
import numpy as np
%matplotlib inline
sns.set( style = "whitegrid", color_codes = True)

In [38]:
df = pd.read_csv('Data.csv')

In [39]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [40]:
df.columns

Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')

In [41]:
df['Machine failure'].value_counts()

0    9661
1     339
Name: Machine failure, dtype: int64

In [42]:
df[df['Machine failure']==1]

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
50,51,L47230,L,298.9,309.1,2861,4.6,143,1,0,0,1,0,0
69,70,L47249,L,298.9,309.0,1410,65.7,191,1,0,0,1,1,0
77,78,L47257,L,298.8,308.9,1455,41.3,208,1,1,0,0,0,0
160,161,L47340,L,298.4,308.2,1282,60.7,216,1,0,0,0,1,0
161,162,L47341,L,298.3,308.1,1412,52.3,218,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9758,9759,L56938,L,298.6,309.8,2271,16.2,218,1,1,0,0,0,0
9764,9765,L56944,L,298.5,309.5,1294,66.7,12,1,0,0,1,0,0
9822,9823,L57002,L,298.5,309.4,1360,60.9,187,1,0,0,0,1,0
9830,9831,L57010,L,298.3,309.3,1337,56.1,206,1,0,0,0,1,0


#feature selection

In [43]:
df1 = df.drop(["UDI", "Product ID"], axis = 1)

In [44]:
# map the type
df1['Type'] = df1['Type'].map({"L" : 0 , "M" : 1 , "H" : 2 })

In [45]:
columns = [i.replace(' ' , '_')for i in df1.columns] 

In [46]:
dict_ = {i : j for i , j in zip(df1.columns , columns)}

In [47]:
df1 = df1.rename(columns = dict_)

In [48]:
df1.columns

Index(['Type', 'Air_temperature_[K]', 'Process_temperature_[K]',
       'Rotational_speed_[rpm]', 'Torque_[Nm]', 'Tool_wear_[min]',
       'Machine_failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [49]:
x = df1.drop(['Air_temperature_[K]'] , axis = 1)

In [50]:
y = df1['Air_temperature_[K]']

In [51]:
## we will need to standarise  the variable as the temperature , torque etc is on different scale 
scaler = StandardScaler()
arr = scaler.fit_transform(x)

In [52]:
# profile report after scaling 
df3 = pd.DataFrame(arr, columns=x.columns)

In [53]:
df3

Unnamed: 0,Type,Process_temperature_[K],Rotational_speed_[rpm],Torque_[Nm],Tool_wear_[min],Machine_failure,TWF,HDF,PWF,OSF,RNF
0,0.744413,-0.947360,0.068185,0.282200,-1.695984,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
1,-0.745307,-0.879959,-0.729472,0.633308,-1.648852,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
2,-0.745307,-1.014761,-0.227450,0.944290,-1.617430,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
3,-0.745307,-0.947360,-0.590021,-0.048845,-1.586009,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
4,-0.745307,-0.879959,-0.729472,0.001313,-1.554588,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
...,...,...,...,...,...,...,...,...,...,...,...
9995,0.744413,-1.082162,0.363820,-1.052012,-1.476034,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9996,2.234132,-1.082162,0.520005,-0.821283,-1.428902,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9997,0.744413,-0.947360,0.592519,-0.660777,-1.350349,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9998,2.234132,-0.879959,-0.729472,0.854005,-1.303217,-0.187322,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363


In [54]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [55]:
vif_df = pd.DataFrame()
vif_df["Feature"] = x.columns

In [56]:
vif_df['vif']= [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [57]:
vif_df

Unnamed: 0,Feature,vif
0,Type,1.003726
1,Process_temperature_[K],1.00492
2,Rotational_speed_[rpm],5.171728
3,Torque_[Nm],5.236158
4,Tool_wear_[min],1.039958
5,Machine_failure,11.831609
6,TWF,2.433879
7,HDF,4.597163
8,PWF,3.624287
9,OSF,3.348866


In [58]:
# here we can see that machine failure is a more than 10 
# so we drop  machine as its is just a represnation of all the othe failure 
df_count =df[df['Machine failure']==1][['TWF', 'HDF', 'PWF', 'OSF', 'RNF']].apply(pd.value_counts)
df_count

Unnamed: 0,TWF,HDF,PWF,OSF,RNF
0,293,224,244,241,338
1,46,115,95,98,1


In [59]:
df3.drop(['Machine_failure'] , axis = 1 , inplace = True)

In [60]:
x = df3
y = df1["Air_temperature_[K]"]
x_train,x_test, y_train, y_test  = train_test_split(x, y, test_size = 0.25, random_state =100)

In [61]:
model = LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [62]:
list(zip(x.columns,model.coef_))

[('Type', 0.0028956645895387395),
 ('Process_temperature_[K]', 1.7318459414454095),
 ('Rotational_speed_[rpm]', 0.02778396283458779),
 ('Torque_[Nm]', -0.0017530854154749742),
 ('Tool_wear_[min]', 0.00850411102358462),
 ('TWF', 0.010957939299885644),
 ('HDF', 0.18088706851064232),
 ('PWF', 0.012051477309352679),
 ('OSF', -0.015908824019811082),
 ('RNF', -0.005126255401818821)]

In [63]:
model.score(x_test,y_test)

0.7964125774373565

In [64]:
import statsmodels.api as sm
x = df1.drop(['Air_temperature_[K]'] , axis = 1)
y = df1['Air_temperature_[K]']
X = sm.add_constant(x)
model = sm.OLS(y,X)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,Air_temperature_[K],R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.775
Method:,Least Squares,F-statistic:,3140.0
Date:,"Tue, 29 Mar 2022",Prob (F-statistic):,0.0
Time:,20:12:57,Log-Likelihood:,-13648.0
No. Observations:,10000,AIC:,27320.0
Df Residuals:,9988,BIC:,27410.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-64.1776,2.000,-32.092,0.000,-68.098,-60.258
Type,-0.0107,0.014,-0.754,0.451,-0.038,0.017
Process_temperature_[K],1.1737,0.006,183.261,0.000,1.161,1.186
Rotational_speed_[rpm],0.0002,0.000,1.578,0.115,-4.59e-05,0.000
Torque_[Nm],0.0003,0.002,0.123,0.902,-0.004,0.005
Tool_wear_[min],8.454e-05,0.000,0.557,0.578,-0.000,0.000
Machine_failure,-0.0918,0.180,-0.510,0.610,-0.445,0.261
TWF,0.2048,0.219,0.937,0.349,-0.224,0.633
HDF,1.7801,0.191,9.338,0.000,1.406,2.154

0,1,2,3
Omnibus:,647.442,Durbin-Watson:,0.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,241.259
Skew:,-0.091,Prob(JB):,4.09e-53
Kurtosis:,2.261,Cond. No.,334000.0


In [65]:
from sklearn.linear_model import ElasticNet , ElasticNetCV

elasticv = ElasticNetCV(l1_ratio=0.5,eps=0.001,n_alphas=100,alphas=None,fit_intercept=True,normalize=True,precompute='auto',max_iter=10000,
             tol=0.0001,cv=10,copy_X=True,verbose=0,n_jobs=None,positive=False,random_state=45,selection='cyclic')

In [66]:
elasticv.fit(x_train,y_train)

ElasticNetCV(cv=10, max_iter=10000, normalize=True, random_state=45)

In [67]:
elastic = ElasticNet(alpha = elasticv.alpha_, l1_ratio = elasticv.l1_ratio_)
elastic.fit(x_train, y_train)
elastic.score(x_test, y_test)

0.7964128136126615

In [68]:
pickle.dump(elastic, open('model_elastic.pkl','wb'))

In [69]:
elasticv.predict(x_test)

array([301.97272347, 299.11494117, 300.91830339, ..., 298.99895077,
       303.2643562 , 298.86871223])

In [72]:
rmse = np.sqrt(mean_squared_error(y_test, elasticv.predict(x_test)))

In [71]:
from sklearn.metrics import mean_squared_error

In [73]:
rmse

0.96491707310211