In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
import joblib
# Download the WordNet resource
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


KeyboardInterrupt



In [None]:
# Load the training and validation splits from the working directory.
# NOTE(review): the column names are overwritten right after loading; if these
# CSVs ship without a header row, read_csv is consuming the first record as the
# header -- confirm, and pass header=None if so.
training, validation = (
    pd.read_csv(csv_path)
    for csv_path in ("twitter_training.csv", "twitter_validation.csv")
)

In [None]:
# Replace the header pandas read from the file with the four working column names.
training.columns=["id","branch","sentiment","tweet"]

In [None]:
# The row identifier is not used as a feature, so drop that column.
training = training.drop(columns=["id"])

In [None]:
# Discard every row that has a missing value in any column, so the text
# cleaning step below only ever receives real strings.
training = training.dropna(axis=0, how="any")

In [None]:
# Give the validation frame the same four column names as the training frame.
validation.columns=["id","branch","sentiment","tweet"]


In [None]:
# Drop the identifier column from the validation frame as well.
validation = validation.drop(columns=["id"])

In [None]:
# Cleaning function
def clean_text(text):
    """Normalise a raw tweet into lowercase text without URLs, mentions,
    hashtags or punctuation.

    Parameters
    ----------
    text : str
        The raw tweet. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned, lower-cased text. Whitespace is left untouched, so
        removed tokens may leave consecutive spaces behind.
    """
    # Missing values must not crash a Series.apply over the whole column.
    if not isinstance(text, str):
        return ''
    # The text is still mixed-case at this point, so match the scheme
    # case-insensitively; otherwise "HTTP://..." links survive as junk tokens.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase


In [None]:
# Tokenization, stop words removal, stemming, and lemmatization
# Recent NLTK releases load the 'punkt_tab' resource inside word_tokenize,
# while older ones load 'punkt'; requesting both keeps this cell working on
# either. 'stopwords' and 'wordnet' back the stop-word set and the lemmatizer.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared, module-level helpers used by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Tokenize `text`, drop English stop words, then lemmatize and stem
    each remaining token.

    Relies on the module-level `stop_words`, `lemmatizer` and `stemmer`.
    Returns the surviving tokens joined by single spaces.
    """
    kept = (tok for tok in word_tokenize(text) if tok not in stop_words)
    # Lemmatize first, then stem the lemma.
    return ' '.join(stemmer.stem(lemmatizer.lemmatize(tok)) for tok in kept)

# Apply cleaning and preprocessing
# Keep both stages as columns: the cleaned text, and the fully
# tokenized / lemmatized / stemmed text used for modelling.
_cleaned_tweets = training['tweet'].map(clean_text)
training['cleaned_text'] = _cleaned_tweets
training['processed_text'] = _cleaned_tweets.map(preprocess_text)


In [None]:
# Preview only the first rows; displaying the full frame bloats the cell output.
training.head()

In [None]:
# Split the data
# 80/20 hold-out split of the processed text against the sentiment label;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    training['processed_text'],
    training['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
from gensim.models import Word2Vec

# Tokenize for Word2Vec
# The processed text is already space-joined, so a whitespace split recovers the tokens.
tokenized_sentences = list(map(str.split, X_train))

# Hyper-parameters shared by both embedding models; only `sg` differs.
_w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)

# Word2Vec model (CBOW)
cbow_model = Word2Vec(sentences=tokenized_sentences, sg=0, **_w2v_params)

# Word2Vec model (Skip-gram)
skipgram_model = Word2Vec(sentences=tokenized_sentences, sg=1, **_w2v_params)

# Get average word vectors for sentences, ensuring handling of empty slices
def get_avg_word2vec(sentence, model):
    """Return the mean vector of the words in `sentence` that `model` knows.

    `sentence` is split on whitespace; words absent from `model.wv` are
    skipped. If no word is known (including an empty sentence), a zero vector
    of length `model.vector_size` is returned instead of averaging nothing.
    """
    vectors = model.wv
    known = [vectors[token] for token in sentence.split() if token in vectors]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Applying the function and ensuring homogeneous arrays
def _embed_corpus(sentences, model):
    """Stack the averaged word vector of every sentence into one array."""
    return np.array([get_avg_word2vec(sentence, model) for sentence in sentences])

X_train_cbow = _embed_corpus(X_train, cbow_model)
X_test_cbow = _embed_corpus(X_test, cbow_model)

X_train_skipgram = _embed_corpus(X_train, skipgram_model)
X_test_skipgram = _embed_corpus(X_test, skipgram_model)

# Sanity check: report the shape of each embedded matrix.
for _label, _matrix in (
    ("X_train_cbow", X_train_cbow),
    ("X_test_cbow", X_test_cbow),
    ("X_train_skipgram", X_train_skipgram),
    ("X_test_skipgram", X_test_skipgram),
):
    print(f"{_label} shape: {_matrix.shape}")


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vectorizer is fitted on the training split only;
# the held-out split is transformed with that already-fitted vectorizer, so
# the order of these two calls matters.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features. As with the bag-of-words cell, the vectorizer is fitted on
# the training split only and then reused unchanged to transform the held-out split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training data and score it on the test data.

    The estimator passed in is fitted in place. Returns a 3-tuple:
    (accuracy, weighted F1 score, classification report).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
        classification_report(y_test, predictions),
    )

In [None]:
# Separate models: one estimator instance per algorithm. Each instance is
# re-fitted for every feature representation by train_and_evaluate.
model_SVM = SVC()
model_NaiveBayes = MultinomialNB()
model_LogisticRegression = LogisticRegression(max_iter=200)
model_AdaBoost = AdaBoostClassifier()

In [None]:
# Naive Bayes
# MultinomialNB only accepts non-negative features, but averaged Word2Vec
# vectors contain negative components, so fitting on them raises ValueError.
# For the two dense embeddings, wrap a copy of the classifier in a pipeline
# that first rescales every dimension to [0, 1]. The scaler is fitted on the
# training split only; clip=True keeps out-of-range test values non-negative.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

model_NaiveBayes_dense = make_pipeline(MinMaxScaler(clip=True), clone(model_NaiveBayes))

print("Evaluating Naive Bayes with CBOW Word2Vec")
results_NaiveBayes_cbow = train_and_evaluate(model_NaiveBayes_dense, X_train_cbow, X_test_cbow, y_train, y_test)

print("Evaluating Naive Bayes with Skip-gram Word2Vec")
results_NaiveBayes_skipgram = train_and_evaluate(model_NaiveBayes_dense, X_train_skipgram, X_test_skipgram, y_train, y_test)

# Bag-of-words counts and TF-IDF weights are already non-negative, so the
# plain classifier can be used on them directly.
print("Evaluating Naive Bayes with BOW")
results_NaiveBayes_bow = train_and_evaluate(model_NaiveBayes, X_train_bow, X_test_bow, y_train, y_test)

print("Evaluating Naive Bayes with TF-IDF")
results_NaiveBayes_tfidf = train_and_evaluate(model_NaiveBayes, X_train_tfidf, X_test_tfidf, y_train, y_test)


In [None]:
# Logistic Regression
# Evaluate the same estimator on each feature representation in turn; the
# announcement is printed before each (potentially slow) fit.
_lr_feature_sets = (
    ("CBOW Word2Vec", X_train_cbow, X_test_cbow),
    ("Skip-gram Word2Vec", X_train_skipgram, X_test_skipgram),
    ("BOW", X_train_bow, X_test_bow),
    ("TF-IDF", X_train_tfidf, X_test_tfidf),
)
_lr_results = []
for _label, _train_features, _test_features in _lr_feature_sets:
    print(f"Evaluating Logistic Regression with {_label}")
    _lr_results.append(
        train_and_evaluate(model_LogisticRegression, _train_features, _test_features, y_train, y_test)
    )

# Expose each result under the name the results summary expects.
(
    results_LogisticRegression_cbow,
    results_LogisticRegression_skipgram,
    results_LogisticRegression_bow,
    results_LogisticRegression_tfidf,
) = _lr_results


In [None]:
# AdaBoost
# Evaluate the same estimator on each feature representation in turn; the
# announcement is printed before each (potentially slow) fit.
_ada_feature_sets = (
    ("CBOW Word2Vec", X_train_cbow, X_test_cbow),
    ("Skip-gram Word2Vec", X_train_skipgram, X_test_skipgram),
    ("BOW", X_train_bow, X_test_bow),
    ("TF-IDF", X_train_tfidf, X_test_tfidf),
)
_ada_results = []
for _label, _train_features, _test_features in _ada_feature_sets:
    print(f"Evaluating AdaBoost with {_label}")
    _ada_results.append(
        train_and_evaluate(model_AdaBoost, _train_features, _test_features, y_train, y_test)
    )

# Expose each result under the name the results summary expects.
(
    results_AdaBoost_cbow,
    results_AdaBoost_skipgram,
    results_AdaBoost_bow,
    results_AdaBoost_tfidf,
) = _ada_results


In [None]:
# Collect the (accuracy, f1, report) tuples returned by train_and_evaluate,
# keyed as "<model>_<features>".
# The SVM entries are disabled: no results_SVM_* variable is assigned in the
# cells visible here, so enabling those lines would raise NameError.
# NOTE(review): the Naive Bayes entries are disabled too -- presumably because
# that evaluation cell did not run to completion; re-enable them once it does.
results = {
    # 'SVM_cbow': results_SVM_cbow,
    # 'SVM_skipgram': results_SVM_skipgram,
    # 'SVM_bow': results_SVM_bow,
    # 'SVM_tfidf': results_SVM_tfidf,
    # 'NaiveBayes_cbow': results_NaiveBayes_cbow,
    # 'NaiveBayes_skipgram': results_NaiveBayes_skipgram,
    # 'NaiveBayes_bow': results_NaiveBayes_bow,
    # 'NaiveBayes_tfidf': results_NaiveBayes_tfidf,
    'LogisticRegression_cbow': results_LogisticRegression_cbow,
    'LogisticRegression_skipgram': results_LogisticRegression_skipgram,
    'LogisticRegression_bow': results_LogisticRegression_bow,
    'LogisticRegression_tfidf': results_LogisticRegression_tfidf,
    'AdaBoost_cbow': results_AdaBoost_cbow,
    'AdaBoost_skipgram': results_AdaBoost_skipgram,
    'AdaBoost_bow': results_AdaBoost_bow,
    'AdaBoost_tfidf': results_AdaBoost_tfidf
}

In [None]:
# Print the metrics of every evaluated configuration.
for name, (accuracy, f1, report) in results.items():
    print(f"Model: {name}\nAccuracy: {accuracy}\nF1 Score: {f1}\nReport:\n{report}\n")

# Example of interpreting results:
# rank the configurations by weighted F1 (index 1 of each result tuple).
best_model = max(results.keys(), key=lambda name: results[name][1])
print(f"The best model is {best_model} with F1 Score of {results[best_model][1]}")