In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [16]:
darwin_df = pd.read_csv('processed_df.csv')
print(darwin_df)

     Unnamed: 0      ID  air_time1  disp_index1  gmrt_in_air1  gmrt_on_paper1  \
0             0    id_1       5160     0.000013    120.804174       86.853334   
1             1    id_2      51980     0.000016    115.318238       83.448681   
2             2    id_3       2600     0.000010    229.933997      172.761858   
3             3    id_4       2130     0.000010    369.403342      183.193104   
4             4    id_5       2310     0.000007    257.997131      111.275889   
..          ...     ...        ...          ...           ...             ...   
169         169  id_170       2930     0.000010    241.736477      176.115957   
170         170  id_171       2140     0.000009    274.728964      234.495802   
171         171  id_172       3830     0.000008    151.536989      171.104693   
172         172  id_173       1760     0.000008    289.518195      196.411138   
173         173  id_174       2875     0.000008    235.769350      178.208024   

     max_x_extension1  max_

In [17]:
# Drop the 'ID' column as it's not needed for the prediction
df = darwin_df.drop(columns=['ID'])

# Step 2: We are trying to predict the value of 'Class'
X = df.drop(columns=['class'])
y = df['class']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalize the features
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=13)

# Step 4: Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("Testing Accuracy: " + str(model.score(X_test,y_test)))
print("Training Accuracy: " + str(model.score(X_train,y_train)))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Testing Accuracy: 0.8571428571428571
Training Accuracy: 0.9568345323741008
Confusion Matrix:
 [[15  1]
 [ 4 15]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.94      0.86        16
           1       0.94      0.79      0.86        19

    accuracy                           0.86        35
   macro avg       0.86      0.86      0.86        35
weighted avg       0.87      0.86      0.86        35



In [18]:
# Step 6: Make predictions
predictions = model.predict(X_test)

# Compare True results vs Prediction results
comparison_df = pd.DataFrame({'True': y_test, 'Predicted': y_pred, 'Model Probability': y_pred_proba})
comparison_df.head()


Unnamed: 0,True,Predicted,Model Probability
170,0,0,0.342026
7,1,0,0.338117
104,0,0,0.385061
93,0,0,0.234167
10,1,1,0.909788
