In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df = pd.read_csv('processed_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,air_time1,disp_index1,gmrt_in_air1,gmrt_on_paper1,max_x_extension1,max_y_extension1,mean_acc_in_air1,mean_acc_on_paper1,...,mean_jerk_in_air25,mean_jerk_on_paper25,mean_speed_in_air25,mean_speed_on_paper25,num_of_pendown25,paper_time25,pressure_mean25,pressure_var25,total_time25,class
0,0,id_1,5160,1.3e-05,120.804174,86.853334,957,6601,0.3618,0.217459,...,0.141434,0.024471,5.596487,3.184589,71,40120,1749.278166,296102.7676,144605,1
1,1,id_2,51980,1.6e-05,115.318238,83.448681,1694,6998,0.272513,0.14488,...,0.049663,0.018368,1.665973,0.950249,129,126700,1504.768272,278744.285,298640,1
2,2,id_3,2600,1e-05,229.933997,172.761858,2333,5802,0.38702,0.181342,...,0.178194,0.017174,4.000781,2.392521,74,45480,1431.443492,144411.7055,79025,1
3,3,id_4,2130,1e-05,369.403342,183.193104,1756,8159,0.556879,0.164502,...,0.113905,0.01986,4.206746,1.613522,123,67945,1465.843329,230184.7154,181220,1
4,4,id_5,2310,7e-06,257.997131,111.275889,987,4732,0.266077,0.145104,...,0.121782,0.020872,3.319036,1.680629,92,37285,1841.702561,158290.0255,72575,1


In [3]:
# Drop the 'ID' column as it's not needed for the prediction
df = df.drop(columns=['ID'])

# Step 2: We are trying to predict the value of 'Class'
X = df.drop(columns=['class'])
y = df['class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalize the features
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=13)

# Step 4: Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("Testing Accuracy: " + str(model.score(X_test,y_test)))
print("Training Accuracy: " + str(model.score(X_train,y_train)))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Testing Accuracy: 0.8571428571428571
Training Accuracy: 0.9568345323741008
Confusion Matrix:
 [[15  1]
 [ 4 15]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.94      0.86        16
           1       0.94      0.79      0.86        19

    accuracy                           0.86        35
   macro avg       0.86      0.86      0.86        35
weighted avg       0.87      0.86      0.86        35



In [4]:
# Step 6: Make predictions
predictions = model.predict(X_test)

# Compare True results vs Prediction results
comparison_df = pd.DataFrame({'True': y_test, 'Predicted': y_pred, 'Model Probability': y_pred_proba})
comparison_df.head()


Unnamed: 0,True,Predicted,Model Probability
170,0,0,0.342026
7,1,0,0.338117
104,0,0,0.385061
93,0,0,0.234167
10,1,1,0.909788
