In [28]:
from utils import get_dict_data_optimized,clean_text,text_lemmatize
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support


# Read the Data

In [29]:
# Load data using the optimized function from utils.py, which uses multithreading to load data faster
# data = get_dict_data_optimized("training")

# Tokenize the Data

In [30]:
# data["tokens"] = data.apply(lambda x:word_tokenize(x["text"]),axis=1)

# Apply Cleaning Functions to the Data
- [ ] 1. Remove punctuation
- [ ] 2. Remove stopwords
- [ ] 3. Remove numbers
- [ ] 4. Remove words with less than 3 characters
- [ ] 5. Remove words that are not in the word list (dictionary)

In [31]:
# data["clean_text"] = data.apply(clean_text,axis=1)

# Lemmatize the Data

In [32]:
# data["clean_text"] = data.apply(text_lemmatize,axis=1)

# Save the preprocessed data to a file
> This is because the preprocessing takes a long time and we don't want to repeat it every time we run the code. We therefore save the preprocessed data to a file and simply read it back from that file whenever we want to use it.

In [33]:
# The preprocessed frame is cached on disk; the export that wrote the cache is
# kept here (disabled) for reference:
# data.to_csv("clean_data.csv",index=False)
# Reload the cached frame from "clean_data.csv".
data = pd.read_csv("clean_data.csv")
# Discard, in place, every row that contains at least one missing value.
data.dropna(axis=0, how="any", inplace=True)

# Feature Engineering
- [ ] 1. Create a new column that contains the length of the text in words (scaled using MinMaxScaler)
- [ ] 2. Create one column for each of the 91 labels indicating whether the label appears in the text or not (Binary Encoding)

In [34]:
# Word count of every cleaned document (whitespace-separated tokens).
data['num_words'] = [len(text.split()) for text in data['clean_text']]

# Rescale the word counts with a min-max scaler; fitting and transforming are
# done in one step on the same (n_rows, 1) column.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed in a later cell, so test rows influence the scaling — confirm
# this is acceptable.
scaler = MinMaxScaler()
data['num_words'] = scaler.fit_transform(data[['num_words']].to_numpy())

# One indicator column per distinct label: 1 when the label string occurs
# anywhere inside clean_text (plain substring test), 0 otherwise.
# NOTE(review): a substring test also fires inside longer words — verify that
# this is the intended matching rule.
labels_values = data["label"].unique()
for label in labels_values:
    data[label] = [int(label in text) for text in data["clean_text"]]

In [35]:
# Feature columns: everything from column position 4 onward (presumably the
# engineered num_words and per-label indicator columns — this depends on the
# column order of the loaded CSV, TODO confirm), followed by the cleaned text
# that is needed for TF-IDF vectorization.
labels_values = [*data.columns[4:], 'clean_text']

# Hold out 15% of the rows for testing; the seed is fixed so the split is
# reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    data[labels_values],
    data['label'],
    test_size=0.15,
    random_state=42,
)

In [36]:
# Drop 'clean_text' from the feature-column list: the raw text is vectorized
# separately and is not a numeric model input.
# The list is rebuilt by filtering (instead of calling list.remove) so that this
# cell can be re-run safely — a second list.remove('clean_text') would raise
# ValueError because the entry is already gone.
labels_values = [column for column in labels_values if column != 'clean_text']

In [37]:
# Vectorize the training text with TF-IDF; the vectorizer is fitted on the
# training split only and reused unchanged for the test split.
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(train_data['clean_text'])

# Engineered feature columns as one dense float64 block, in labels_values order.
# Read-only access: nothing is assigned back into the split frame, which avoids
# a chained assignment on a slice returned by train_test_split.
train_extra = train_data[labels_values].astype('float64').values

# Append the engineered block to the right of the TF-IDF matrix with a single
# hstack call (same column order as stacking the columns one at a time, without
# rebuilding the whole matrix once per column). CSR is requested explicitly as
# a row-oriented format for the estimators fitted below.
train_vectors = hstack((train_tfidf, train_extra), format='csr')


In [38]:
# Vectorize the test text with the vectorizer already fitted on the training
# split (transform only — nothing is re-fitted on test data).
test_tfidf = vectorizer.transform(test_data['clean_text'])

# Engineered feature columns of the TEST split as one dense float64 block.
# Fix: the cast previously targeted train_data (copy-paste slip), so the test
# columns were never converted explicitly.
test_extra = test_data[labels_values].astype('float64').values

# Append the engineered block to the right of the TF-IDF matrix in one hstack
# call, keeping the same column order as the training matrix.
test_vectors = hstack((test_tfidf, test_extra), format='csr')

# Logistic Regression Model (l1 regularization)

In [39]:
# Logistic regression with an L1 penalty, solved with liblinear.
# random_state is pinned because scikit-learn documents it as being used by the
# liblinear solver to shuffle the data; without it, repeated runs can produce
# slightly different coefficients and scores. 42 matches the split seed.
lr_clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
# Fit on the combined TF-IDF + engineered training matrix.
lr_clf.fit(train_vectors, train_labels)

# Logistic Regression Model (Results)

In [40]:
# Label predictions of the fitted logistic-regression model for the training
# split and for the held-out test split.
train_pred, test_pred = lr_clf.predict(train_vectors), lr_clf.predict(test_vectors)


In [41]:
# Accuracy of the logistic-regression predictions on both splits.
train_acc = accuracy_score(train_labels, train_pred)
print(f"Training accuracy: {train_acc}")
test_acc = accuracy_score(test_labels, test_pred)
print(f"Test accuracy: {test_acc}")

# Fix: every test-set metric below scores `test_pred`, the test predictions
# produced by the prediction cell. The previous name `lr_pred` is not assigned
# in any of the cells shown, so these calls would raise NameError on a fresh
# kernel (Restart & Run All).

# Per-label precision, recall, F1-score and support on the test set.
precision, recall, f1, support = precision_recall_fscore_support(test_labels, test_pred)

# Micro average (pooled over all samples) and macro average (unweighted mean
# over labels) of precision, recall and F1-score.
micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(test_labels, test_pred, average='micro')
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(test_labels, test_pred, average='macro')

# Print results
print('Micro average')
print(f'Precision: {micro_precision:.4f}')
print(f'Recall: {micro_recall:.4f}')
print(f'F1-score: {micro_f1:.4f}')
print('\nMacro average')
print(f'Precision: {macro_precision:.4f}')
print(f'Recall: {macro_recall:.4f}')
print(f'F1-score: {macro_f1:.4f}')

Training accuracy: 0.7618556701030927
Test accuracy: 0.7324766355140186
Micro average
Precision: 0.7325
Recall: 0.7325
F1-score: 0.7325

Macro average
Precision: 0.3922
Recall: 0.3547
F1-score: 0.3552


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes Model 

In [42]:
# Multinomial Naive Bayes classifier from scikit-learn.
from sklearn.naive_bayes import MultinomialNB
# Create the (still unfitted) estimator with its default settings.
nb_clf = MultinomialNB()
# Fit it on the combined TF-IDF + engineered training matrix.
nb_clf.fit(X=train_vectors, y=train_labels)

# Naive Bayes Model (Results)

In [43]:
# Predictions of the fitted Naive Bayes model on the training and test splits.
train_pred = nb_clf.predict(train_vectors)
test_pred = nb_clf.predict(test_vectors)

# Accuracy on each split.
train_acc = accuracy_score(train_labels, train_pred)
test_acc = accuracy_score(test_labels, test_pred)
print(f"Training accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")

# Micro- and macro-averaged precision, recall and F1-score on the test split
# (the fourth returned value, the support, is discarded).
micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
    test_labels, test_pred, average='micro'
)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    test_labels, test_pred, average='macro'
)

# Print both averages with the same layout: heading, then the three scores.
for heading, (avg_precision, avg_recall, avg_f1) in (
    ('Micro average', (micro_precision, micro_recall, micro_f1)),
    ('\nMacro average', (macro_precision, macro_recall, macro_f1)),
):
    print(heading)
    print(f'Precision: {avg_precision:.4f}')
    print(f'Recall: {avg_recall:.4f}')
    print(f'F1-score: {avg_f1:.4f}')

Training accuracy: 0.6308247422680412
Test accuracy: 0.616822429906542
Micro average
Precision: 0.6168
Recall: 0.6168
F1-score: 0.6168

Macro average
Precision: 0.1533
Recall: 0.0969
F1-score: 0.0980


  _warn_prf(average, modifier, msg_start, len(result))


# SVM

In [44]:
# Support-vector classifier with scikit-learn's default settings.
svm_clf = SVC()
# Fit it on the combined TF-IDF + engineered training matrix.
svm_clf.fit(X=train_vectors, y=train_labels)


# SVM (Results)

In [45]:
# Predictions of the fitted SVM on the training split and on the test split.
train_pred = svm_clf.predict(train_vectors)
test_pred = svm_clf.predict(test_vectors)

# Accuracy on each split.
train_acc = accuracy_score(train_labels, train_pred)
test_acc = accuracy_score(test_labels, test_pred)
print(f"Training accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")

# Micro- and macro-averaged precision, recall and F1-score on the test split
# (the fourth returned value, the support, is discarded).
micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
    test_labels, test_pred, average='micro'
)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
    test_labels, test_pred, average='macro'
)

# Print results: one line per item, micro average first, then macro average.
print(
    'Micro average',
    f'Precision: {micro_precision:.4f}',
    f'Recall: {micro_recall:.4f}',
    f'F1-score: {micro_f1:.4f}',
    sep='\n',
)
print(
    '\nMacro average',
    f'Precision: {macro_precision:.4f}',
    f'Recall: {macro_recall:.4f}',
    f'F1-score: {macro_f1:.4f}',
    sep='\n',
)

Training accuracy: 0.8069072164948453
Test accuracy: 0.6711448598130841
Micro average
Precision: 0.6711
Recall: 0.6711
F1-score: 0.6711

Macro average
Precision: 0.3366
Recall: 0.2698
F1-score: 0.2857


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
