In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

---

## Split the Data into Training and Testing Sets

In [2]:
# Load the test data into a Pandas DataFrame
df_test = pd.read_csv("https://machine-failure-data-20230822-craiguo.s3.us-west-2.amazonaws.com/test.csv")
df_test.drop(columns = ["id", 'Product ID'], inplace = True)
# Display sample data
df_test.head(10)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,M,303.4,312.3,1515,41.3,114,0,0,0,0,0
5,L,299.1,308.3,1489,38.2,139,0,0,0,0,0
6,L,299.8,309.1,1429,39.9,207,0,0,0,0,0
7,L,302.7,312.4,1540,46.2,17,0,0,0,0,0
8,H,300.7,311.9,1613,36.0,12,0,0,0,0,0
9,L,300.5,311.4,1708,32.2,57,0,0,0,0,0


In [3]:
# Load the test data into a Pandas DataFrame
df_train = pd.read_csv("https://machine-failure-data-20230822-craiguo.s3.us-west-2.amazonaws.com/train.csv")
df_train.drop(columns = ["id", "Product ID"], inplace = True)
# Display sample data
df_train.head(10)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
5,M,298.4,308.9,1429,42.1,65,0,0,0,0,0,0
6,L,299.6,311.0,1413,42.9,156,0,0,0,0,0,0
7,L,298.7,310.1,1609,38.1,67,0,0,0,0,0,0
8,L,297.7,308.8,1578,35.2,13,0,0,0,0,0,0
9,L,300.5,312.3,1447,53.3,98,0,0,0,0,0,0


In [4]:
# Look at nunique counts for train
df_train.nunique()

Type                         3
Air temperature [K]         95
Process temperature [K]     81
Rotational speed [rpm]     952
Torque [Nm]                611
Tool wear [min]            246
Machine failure              2
TWF                          2
HDF                          2
PWF                          2
OSF                          2
RNF                          2
dtype: int64

In [5]:
# Look at nunique counts for test
df_test.nunique()

Type                         3
Air temperature [K]         92
Process temperature [K]     84
Rotational speed [rpm]     946
Torque [Nm]                595
Tool wear [min]            246
TWF                          2
HDF                          2
PWF                          2
OSF                          2
RNF                          2
dtype: int64

In [6]:
# concatenate the dataframes to use pd.get_dummies() on 'Product ID' and 'Type'

combined_df = pd.concat([df_train, df_test], sort=False)
combined_df.tail()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
90949,L,302.3,311.4,1484,40.4,15,,0,0,0,0,0
90950,L,297.9,309.8,1542,33.8,31,,0,0,0,0,0
90951,L,295.6,306.2,1501,41.4,187,,0,0,0,0,0
90952,L,298.1,307.8,1534,40.3,69,,0,0,0,0,0
90953,L,303.5,312.8,1534,36.1,92,,0,0,0,0,0


In [7]:
#use pd.get_dummies() for 'Product ID' and 'Type' and split test and train sets again

dummies_df = pd.get_dummies(combined_df)

col = df_train.shape[0]

dummy_train = dummies_df.iloc[0:col]
dummy_test = dummies_df.iloc[col:]

### Step 2: Create features and target arrays

In [8]:
# Split our preprocessed data into our features and target arrays

y = dummy_train['Machine failure']
X = dummy_train.drop(columns = 'Machine failure')

In [9]:
# Review the y variable Series
y[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Machine failure, dtype: float64

In [10]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,1,0
1,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,1
2,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,1,0
3,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,1,0
4,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,1


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
#Scale the data using StandardScaler()
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [13]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver="lbfgs", random_state=1)

# Fit the model using training data
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [14]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test_scaled)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [15]:
# Generate a confusion matrix for the model
print(confusion_matrix(y_test, predictions))

[[33559     5]
 [  131   413]]


In [16]:
#print the accuracy score
print(f"accuracy: {accuracy_score(y_test, predictions)}")
# Print the classification report for the model
print(classification_report(y_test, predictions))

accuracy: 0.9960126656502873
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     33564
         1.0       0.99      0.76      0.86       544

    accuracy                           1.00     34108
   macro avg       0.99      0.88      0.93     34108
weighted avg       1.00      1.00      1.00     34108

