In [1]:
import os
import pandas as pd
import re
import nltk
import contractions
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the CSV data into a DataFrame
csv_file = "combined_data.csv"
df = pd.read_csv(csv_file)
df.dropna(subset=['body'], inplace=True)
df.drop_duplicates(subset='body', inplace=True)
flagged_df = df[df['label'] != 'nonflagged']
non_flagged_df = df[df['label'] == 'nonflagged']

num_samples_per_class = 100000
# Randomly sample from each class to create a balanced dataset
flagged_sampled = flagged_df.sample(n=num_samples_per_class, random_state=50)
non_flagged_sampled = non_flagged_df.sample(n=num_samples_per_class, random_state=50)

# # Concatenate the sampled DataFrames to create the balanced dataset
df = pd.concat([flagged_sampled, non_flagged_sampled], ignore_index=True)
'''
Added 2 more pre processing steps.
'''

import pandas as pd
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
import contractions
def preprocess_text(text):
    if pd.isna(text) or not text.strip():
        return ''  # Replace empty strings or NaN with an empty string
    
    text = text.lower()
    text = re.sub(r'[^\w\s,]', '', text)
    text = re.sub(r'[0-9]', '', text)
    text = ''.join(char for char in text if ord(char) < 128)
    
    # Normalize contractions
    text = contractions.fix(text)
    
    # Tokenize the text using NLTK's word_tokenize
    words = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))  # Change 'english' to your language if needed
    words = [word for word in words if word not in stop_words]
    
    # Perform stemming and lemmatization
    '''
    Stemmer and lemmatizer are going to convert a word to it's root form.
    '''
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words]
    
    return ' '.join(words)

df['body'] = df['body'].apply(preprocess_text)
# Check the number of rows in the DataFrame after preprocessing
num_rows = df.shape[0]
print(f'Number of rows after preprocessing: {num_rows}')
# Label Encoding
df['label'] = df['label'].map({'deleted': 1, 'hidden': 1, 'nonflagged': 0})

Number of rows after preprocessing: 200000


In [18]:
# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(df['body'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Feature Extraction using Count Vectorization
count_vectorizer = CountVectorizer(max_features=1000)  # Adjust max_features as needed
X_count = count_vectorizer.fit_transform(df['body'])

# Train-Test Split
X_train_count, X_test_count, y_train_count, y_test_count = train_test_split(X_count, df['label'], test_size=0.2, random_state=42)

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split

# Feature Extraction using Hash Vectorization
hash_vectorizer = HashingVectorizer(n_features=1000)  # Adjust n_features as needed
X_hashed = hash_vectorizer.fit_transform(df['body'])

# Train-Test Split
X_train_hashed, X_test_hashed, y_train_hashed, y_test_hashed = train_test_split(X_hashed, df['label'], test_size=0.2, random_state=42)

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

multinomial_naive_bayes_model = MultinomialNB(alpha=1500000,fit_prior = False)
multinomial_naive_bayes_model.fit(X_train, y_train)

# Predictions
y_pred = multinomial_naive_bayes_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print('Multinomial Naive Bayes Model Evaluation with FastText Vectorization:')
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(class_report)
print('Confusion Matrix:')
print(conf_matrix)

Multinomial Naive Bayes Model Evaluation with FastText Vectorization:
Accuracy: 0.5836
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.90      0.68     20007
           1       0.72      0.27      0.39     19993

    accuracy                           0.58     40000
   macro avg       0.64      0.58      0.54     40000
weighted avg       0.64      0.58      0.54     40000

Confusion Matrix:
[[17954  2053]
 [14603  5390]]


In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming you have df defined with 'body' and 'label' columns

# Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df['body'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the range of values for alpha
param_grid = {
    'alpha': list(range(1500000,2000000, 10000)) ,
    'fit_prior': [True, False],
}

# Create a Multinomial Naive Bayes model
multinomial_naive_bayes_model = MultinomialNB()

# Perform the grid search
grid_search = GridSearchCV(multinomial_naive_bayes_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print('Best Hyperparameters:', best_params)
print('Multinomial Naive Bayes Model Evaluation with TF-IDF Vectorization:')
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(class_report)
print('Confusion Matrix:')
print(conf_matrix)

Best Hyperparameters: {'alpha': 1500000, 'fit_prior': False}
Multinomial Naive Bayes Model Evaluation with TF-IDF Vectorization:
Accuracy: 0.5836
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.90      0.68     20007
           1       0.72      0.27      0.39     19993

    accuracy                           0.58     40000
   macro avg       0.64      0.58      0.54     40000
weighted avg       0.64      0.58      0.54     40000

Confusion Matrix:
[[17954  2053]
 [14603  5390]]
