In [8]:
from pathlib import Path

import pandas as pd
from google.colab import drive

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount('/content/drive')

# Single place that names the project's data folder. The directory used to be
# spelled out in full inside every read_csv call, so moving the project meant
# editing each long literal by hand.
DATA_DIR = Path('/content/drive/MyDrive/Coursework/2024-2025/2024 Fall/CSCI 5541 Natural Language Processing/Group Project/Phish and Chips Final Project/data')

# Two CSV exports of the results table; the second file name carries a
# `_normalized` suffix. Both frames are read by the modelling cells below.
text_df = pd.read_csv(DATA_DIR / 'final_results_keywords30_sample_100.csv')
text_normal_df = pd.read_csv(DATA_DIR / 'final_results_keywords30_sample_100_normalized.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Logistic-regression baseline on `text_df` using all nine numeric columns,
# including `confidence`. Prints hold-out metrics and saves the fitted
# coefficients to feature_importance.csv.

# --- Inputs -----------------------------------------------------------------
feature_columns = [
    'confidence', 'avg_zipf_score', 'commonality_score', 'adjusted_score',
    'total_words', 'average_syllables_per_word', 'average_characters_per_word',
    'ARI', 'Flesch-Kincaid Grade',
]
features = text_df[feature_columns]
labels = text_df['label']

# --- Hold-out split ---------------------------------------------------------
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so the class mix of the test
# fold is whatever the shuffle produces -- consider stratify=labels.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# --- Scaling ----------------------------------------------------------------
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both folds, so nothing from the test fold leaks in.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model ------------------------------------------------------------------
# Default hyper-parameters; later cells read `model` to print the intercept.
model = LogisticRegression().fit(X_train, y_train)

# --- Predictions ------------------------------------------------------------
# Column 1 of predict_proba is the score handed to ROC-AUC below.
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Coefficients -----------------------------------------------------------
# Coefficients are on the standardized scale because the model was fitted on
# scaled inputs. Rows are ordered by signed value (most positive first), not
# by magnitude, so a strongly negative feature ends up at the bottom.
# NOTE(review): in the recorded run commonality_score and adjusted_score get
# equal-and-opposite coefficients, which suggests the two columns are
# linearly dependent -- verify and consider dropping one.
# NOTE(review): `confidence` is presumably the RoBERTa classifier's output
# (the run without it is saved as *_withoutRoberta) -- confirm it is not
# derived from the label itself.
coefficient_table = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': model.coef_[0],
})
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

print("Feature Importance:\n", feature_importance)

importance_csv_path = '/content/drive/MyDrive/Coursework/2024-2025/2024 Fall/CSCI 5541 Natural Language Processing/Group Project/Phish and Chips Final Project/data/feature_importance.csv'
feature_importance.to_csv(importance_csv_path, index=False)

Accuracy: 1.0
ROC-AUC: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00         7

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Feature Importance:
                        Feature  Coefficient
0                   confidence     2.993977
5   average_syllables_per_word     0.504654
7                          ARI     0.178134
4                  total_words     0.104803
2            commonality_score     0.072311
3               adjusted_score    -0.072311
1               avg_zipf_score    -0.166475
6  average_characters_per_word    -0.219793
8         Flesch-Kincaid Grade    -0.239904


In [17]:
# Bias term of the fitted logistic regression. This reads the global `model`,
# which every training cell in this notebook rebinds, so run it right after
# the training cell whose intercept you want.
bias_terms = model.intercept_
intercept = bias_terms[0]
print("Intercept:", intercept)

Intercept: -0.5678807362408317


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Same logistic-regression pipeline as the raw-data run, but fed from
# `text_normal_df`. Results are saved to feature_importance_normalized.csv.
# NOTE(review): StandardScaler re-centres and re-scales every column, so if
# the normalized CSV differs from the raw one only by a per-column affine
# rescaling this fit will match the raw run exactly -- the recorded outputs
# of the two cells are identical; confirm that is expected.

# --- Inputs -----------------------------------------------------------------
feature_columns = [
    'confidence', 'avg_zipf_score', 'commonality_score', 'adjusted_score',
    'total_words', 'average_syllables_per_word', 'average_characters_per_word',
    'ARI', 'Flesch-Kincaid Grade',
]
features = text_normal_df[feature_columns]
labels = text_normal_df['label']

# --- Hold-out split ---------------------------------------------------------
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# NOTE(review): not stratified -- consider stratify=labels.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# --- Scaling ----------------------------------------------------------------
# Statistics come from the training rows only and are reused for the test fold.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model ------------------------------------------------------------------
# Default hyper-parameters; rebinding `model` replaces the previous fit.
model = LogisticRegression().fit(X_train, y_train)

# --- Predictions ------------------------------------------------------------
# Column 1 of predict_proba is the score handed to ROC-AUC below.
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Coefficients -----------------------------------------------------------
# Standardized-scale coefficients, ordered by signed value (most positive
# first) rather than by magnitude.
coefficient_table = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': model.coef_[0],
})
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

print("Feature Importance:\n", feature_importance)

importance_csv_path = '/content/drive/MyDrive/Coursework/2024-2025/2024 Fall/CSCI 5541 Natural Language Processing/Group Project/Phish and Chips Final Project/data/feature_importance_normalized.csv'
feature_importance.to_csv(importance_csv_path, index=False)

Accuracy: 1.0
ROC-AUC: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00         7

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Feature Importance:
                        Feature  Coefficient
0                   confidence     2.993977
5   average_syllables_per_word     0.504654
7                          ARI     0.178134
4                  total_words     0.104803
2            commonality_score     0.072311
3               adjusted_score    -0.072311
1               avg_zipf_score    -0.166475
6  average_characters_per_word    -0.219793
8         Flesch-Kincaid Grade    -0.239904


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Logistic-regression run on `text_df` that omits the `confidence` column and
# keeps the eight remaining numeric columns. Coefficients are saved to
# feature_importance_withoutRoberta.csv.

# --- Inputs -----------------------------------------------------------------
feature_columns = [
    'avg_zipf_score', 'commonality_score', 'adjusted_score', 'total_words',
    'average_syllables_per_word', 'average_characters_per_word',
    'ARI', 'Flesch-Kincaid Grade',
]
features = text_df[feature_columns]
labels = text_df['label']

# --- Hold-out split ---------------------------------------------------------
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# NOTE(review): not stratified -- consider stratify=labels.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# --- Scaling ----------------------------------------------------------------
# Statistics come from the training rows only and are reused for the test fold.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model ------------------------------------------------------------------
# Default hyper-parameters; later cells read `model` to print the intercept.
model = LogisticRegression().fit(X_train, y_train)

# --- Predictions ------------------------------------------------------------
# Column 1 of predict_proba is the score handed to ROC-AUC below.
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Coefficients -----------------------------------------------------------
# Standardized-scale coefficients, ordered by signed value (most positive
# first). The ordering is not by magnitude, so the largest negative
# coefficient is listed last even when it is the largest in absolute value.
coefficient_table = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': model.coef_[0],
})
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

print("Feature Importance:\n", feature_importance)

importance_csv_path = '/content/drive/MyDrive/Coursework/2024-2025/2024 Fall/CSCI 5541 Natural Language Processing/Group Project/Phish and Chips Final Project/data/feature_importance_withoutRoberta.csv'
feature_importance.to_csv(importance_csv_path, index=False)

Accuracy: 0.65
ROC-AUC: 0.6923076923076923
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.69      0.72        13
           1       0.50      0.57      0.53         7

    accuracy                           0.65        20
   macro avg       0.62      0.63      0.63        20
weighted avg       0.66      0.65      0.65        20

Feature Importance:
                        Feature  Coefficient
4   average_syllables_per_word     0.593366
5  average_characters_per_word     0.254645
3                  total_words     0.132057
1            commonality_score     0.095453
6                          ARI    -0.046090
2               adjusted_score    -0.095453
0               avg_zipf_score    -0.142943
7         Flesch-Kincaid Grade    -0.675097


In [12]:
# beta_0 of the fitted logistic regression. This reads the global `model`,
# which every training cell in this notebook rebinds, so run it right after
# the training cell whose intercept you want.
bias_terms = model.intercept_
intercept = bias_terms[0]
print("Intercept:", intercept)

Intercept: -0.21783556457849426


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Logistic-regression run on `text_normal_df` that omits the `confidence`
# column. Coefficients are saved to
# feature_importance_withoutRoberta_normalized.csv.
# NOTE(review): StandardScaler re-centres and re-scales every column, so if
# the normalized CSV differs from the raw one only by a per-column affine
# rescaling this fit will match the raw run exactly -- the recorded outputs
# of the two cells are identical; confirm that is expected.

# --- Inputs -----------------------------------------------------------------
feature_columns = [
    'avg_zipf_score', 'commonality_score', 'adjusted_score', 'total_words',
    'average_syllables_per_word', 'average_characters_per_word',
    'ARI', 'Flesch-Kincaid Grade',
]
features = text_normal_df[feature_columns]
labels = text_normal_df['label']

# --- Hold-out split ---------------------------------------------------------
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# NOTE(review): not stratified -- consider stratify=labels.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# --- Scaling ----------------------------------------------------------------
# Statistics come from the training rows only and are reused for the test fold.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model ------------------------------------------------------------------
# Default hyper-parameters; rebinding `model` replaces the previous fit.
model = LogisticRegression().fit(X_train, y_train)

# --- Predictions ------------------------------------------------------------
# Column 1 of predict_proba is the score handed to ROC-AUC below.
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# --- Evaluation -------------------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Coefficients -----------------------------------------------------------
# Standardized-scale coefficients, ordered by signed value (most positive
# first) rather than by magnitude.
coefficient_table = pd.DataFrame({
    'Feature': features.columns,
    'Coefficient': model.coef_[0],
})
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

print("Feature Importance:\n", feature_importance)

importance_csv_path = '/content/drive/MyDrive/Coursework/2024-2025/2024 Fall/CSCI 5541 Natural Language Processing/Group Project/Phish and Chips Final Project/data/feature_importance_withoutRoberta_normalized.csv'
feature_importance.to_csv(importance_csv_path, index=False)

Accuracy: 0.65
ROC-AUC: 0.6923076923076923
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.69      0.72        13
           1       0.50      0.57      0.53         7

    accuracy                           0.65        20
   macro avg       0.62      0.63      0.63        20
weighted avg       0.66      0.65      0.65        20

Feature Importance:
                        Feature  Coefficient
4   average_syllables_per_word     0.593366
5  average_characters_per_word     0.254645
3                  total_words     0.132057
1            commonality_score     0.095453
6                          ARI    -0.046090
2               adjusted_score    -0.095453
0               avg_zipf_score    -0.142943
7         Flesch-Kincaid Grade    -0.675097
