# Given a csv file, extract the data and run linear regression analysis on data.

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [28]:
# Load the data
data = pd.read_csv('ai4i2020.csv')

# preview data frame
data.head(5)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [29]:
#  the shape of the data
data.shape 

(10000, 14)

In [30]:
# information on the data 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [None]:
# drop column that are not needed
data.drop(labels=['Product ID','Type','TWF','HDF',	'PWF',	'OSF',	'RNF'],axis=1,inplace=True)
data

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,298.1,308.6,1551,42.8,0,0
1,2,298.2,308.7,1408,46.3,3,0
2,3,298.1,308.5,1498,49.4,5,0
3,4,298.2,308.6,1433,39.5,7,0
4,5,298.2,308.7,1408,40.0,9,0
...,...,...,...,...,...,...,...
9995,9996,298.8,308.4,1604,29.5,14,0
9996,9997,298.9,308.4,1632,31.8,17,0
9997,9998,299.0,308.6,1645,33.4,22,0
9998,9999,299.0,308.7,1408,48.5,25,0


In [None]:

# Separate variable from features
X = data.drop('Machine failure', axis=1)
y = data['Machine failure']
data

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,298.1,308.6,1551,42.8,0,0
1,2,298.2,308.7,1408,46.3,3,0
2,3,298.1,308.5,1498,49.4,5,0
3,4,298.2,308.6,1433,39.5,7,0
4,5,298.2,308.7,1408,40.0,9,0
...,...,...,...,...,...,...,...
9995,9996,298.8,308.4,1604,29.5,14,0
9996,9997,298.9,308.4,1632,31.8,17,0
9997,9998,299.0,308.6,1645,33.4,22,0
9998,9999,299.0,308.7,1408,48.5,25,0


In [33]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [34]:

# Check for non-numeric columns and convert them to numeric if possible
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values that could not be converted to numeric
X_train = X_train.dropna()
X_test = X_test.dropna()

# Ensure y_train and y_test are aligned with the cleaned X_train and X_test
y_train = y_train[X_train.index]
y_test = y_test[X_test.index]

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)



In [35]:
# Make predictions
y_pred = model.predict(X_test)
y_pred


array([0.0272956 , 0.06506194, 0.03574459, ..., 0.05672005, 0.252866  ,
       0.04634286], shape=(2000,))

In [36]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.025939557000236944
R^2 Score: 0.1227671184153758
