In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error , mean_absolute_error
import re
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Part 1: Language Modeling / Regression

### analysis

In [3]:
df = pd.read_csv("answers.csv")

In [4]:
print(df["id"].unique())
print(df["correct"].unique())

[ 1.1  1.2  1.3  1.4  1.5  1.6  1.7  2.1  2.2  2.3  2.4  2.5  2.6  2.7
  3.1  3.2  3.3  3.4  3.5  3.6  3.7  4.1  4.2  4.3  4.4  4.5  4.6  4.7
  5.1  5.2  5.3  5.4  6.1  6.2  6.3  6.4  6.5  6.6  6.7  7.1  7.2  7.3
  7.4  7.5  7.6  7.7  8.1  8.2  8.3  8.4  8.5  8.6  8.7  9.1  9.2  9.3
  9.4  9.5  9.6  9.7 10.1 10.2 10.3 10.4 10.5 10.6 10.7 11.1 11.2 11.3
 11.4 11.5 11.6 11.7 11.8 11.9 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8
 12.9]
[0. 1.]


In [5]:
df.head()

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0


In [6]:
df.drop(columns=["id"],inplace=True)

## Pipeline 

In [7]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

    

[nltk_data] Downloading package punkt to /home/unamed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/unamed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Initialize necessary components
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stop words removal
    tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Stemming
    tokens_stemmed = [ps.stem(word) for word in tokens]
    
    # Lemmatization
    tokens_lemmatized = [lemmatizer.lemmatize(word) for word in tokens_stemmed]
    
    # Join tokens back into a single string
    return ' '.join(tokens_lemmatized)

In [10]:
# Apply preprocessing
df['answer'] = df['answer'].apply(preprocess_text)
print(df)




                                                 answer  score  correct
0     high risk problem address prototyp program mak...    3.5      0.0
1     simul portion desir final product quick easi p...    5.0      1.0
2     prototyp program simul behavior portion desir ...    4.0      1.0
3     defin specif phase prototyp stimul behavior po...    5.0      1.0
4     use let user first idea complet program allow ...    3.0      0.0
...                                                 ...    ...      ...
2437                                              log n    5.0      1.0
2438                                     minu 1 divid 2    1.5      0.0
2439                                               2n-1    2.5      0.0
2440                      take h step , h height tree .    5.0      1.0
2441  depend instal search tree whatev case repeat b...    1.5      0.0

[2442 rows x 3 columns]


In [11]:
# Tokenize the answer column for Word2Vec
df['tokenized_answer'] = df['answer'].apply(nltk.word_tokenize)



In [12]:
df['tokenized_answer']

0       [high, risk, problem, address, prototyp, progr...
1       [simul, portion, desir, final, product, quick,...
2       [prototyp, program, simul, behavior, portion, ...
3       [defin, specif, phase, prototyp, stimul, behav...
4       [use, let, user, first, idea, complet, program...
                              ...                        
2437                                             [log, n]
2438                                  [minu, 1, divid, 2]
2439                                               [2n-1]
2440               [take, h, step, ,, h, height, tree, .]
2441    [depend, instal, search, tree, whatev, case, r...
Name: tokenized_answer, Length: 2442, dtype: object

In [13]:
# Word2Vec - CBOW
cbow_model = Word2Vec(sentences=df['tokenized_answer'], vector_size=100, window=5, min_count=1, sg=0)
# Word2Vec - Skip Gram 
skipgram_model = Word2Vec(sentences=df['tokenized_answer'], vector_size=100, window=5, min_count=1, sg=1)


In [14]:
def get_sentence_embedding(model, tokens):
    # Ensure tokens are in the model's vocabulary
    valid_tokens = [token for token in tokens if token in model.wv]
    if not valid_tokens:
        return [0] * model.vector_size
    # Compute the average of the word vectors
    return sum(model.wv[token] for token in valid_tokens) / len(valid_tokens)


In [15]:
# Apply the function to get embeddings
df['cbow_embedding'] = df['tokenized_answer'].apply(lambda tokens: get_sentence_embedding(cbow_model, tokens))
print("CBOW Sentence Embeddings:\n", df['cbow_embedding'])

CBOW Sentence Embeddings:
 0       [0.018987974, 0.22650962, 0.108676255, -0.2046...
1       [0.011788357, 0.18661962, 0.09015037, -0.16836...
2       [0.017538091, 0.21272016, 0.10427611, -0.19417...
3       [0.014323343, 0.19639497, 0.097186245, -0.1761...
4       [0.014706535, 0.18933487, 0.092767335, -0.1709...
                              ...                        
2437    [0.024387639, 0.17015204, 0.052350625, -0.1456...
2438    [0.021012291, 0.25003892, 0.10352617, -0.22135...
2439    [-0.008946666, 0.0018900299, -0.004067398, 0.0...
2440    [0.016542342, 0.24928032, 0.12825942, -0.21960...
2441    [0.012801052, 0.19975168, 0.10980461, -0.18189...
Name: cbow_embedding, Length: 2442, dtype: object


In [16]:
df['sg_embedding'] = df['tokenized_answer'].apply(lambda tokens: get_sentence_embedding(skipgram_model, tokens))
print("skip Gram Sentence Embeddings:\n", df['sg_embedding'])

skip Gram Sentence Embeddings:
 0       [0.0053549623, 0.0361633, 0.112275854, 0.01101...
1       [-0.013536243, 0.022897307, 0.11400955, 0.0120...
2       [0.0056831785, 0.040357243, 0.12781449, -0.005...
3       [-0.0006274767, 0.03175342, 0.120221324, 0.010...
4       [0.004510333, 0.035062883, 0.117898084, 0.0048...
                              ...                        
2437    [0.08700921, 0.16013557, -0.034487374, -0.0242...
2438    [0.020342002, 0.12897387, 0.007920422, -0.0231...
2439    [-0.008946666, 0.0018900299, -0.004067398, 0.0...
2440    [0.021881243, 0.0062250155, 0.13907202, 0.0275...
2441    [-0.016174663, -0.025009086, 0.14812441, 0.027...
Name: sg_embedding, Length: 2442, dtype: object


In [17]:
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['answer'])

print("Bag of Words Representation:\n", X_bow.toarray())
print("Feature Names:\n", vectorizer.get_feature_names_out())

Bag of Words Representation:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Feature Names:
 ['000000' '0the' '0x' ... 'your' 'zero' 'zillion']


In [18]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['answer'])

print("TF-IDF Representation:\n", X_tfidf.toarray())
print("Feature Names:\n", tfidf_vectorizer.get_feature_names_out())


TF-IDF Representation:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Feature Names:
 ['000000' '0the' '0x' ... 'your' 'zero' 'zillion']


In [19]:
# Prepare data for modeling

X = pd.DataFrame(df['cbow_embedding'].tolist())#df.drop(columns=["tokenized_answer","sg_embedding","answer","score"])
X.add(df["correct"], axis=1)
y = df["score"]
print("X shape \n :", X.shape)
print("y shape \n :", y.shape)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize models
svr_model = SVR()
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor()

# Train and evaluate SVR
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_cv_mse = -cross_val_score(svr_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
svr_cv_mae = -cross_val_score(svr_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Train and evaluate Linear Regression
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)
linear_cv_mse = -cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
linear_cv_mae = -cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Train and evaluate Decision Tree Regressor
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
tree_mse = mean_squared_error(y_test, y_pred_tree)
tree_mae = mean_absolute_error(y_test, y_pred_tree)
tree_cv_mse = -cross_val_score(tree_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
tree_cv_mae = -cross_val_score(tree_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Print results
print("SVR Mean Squared Error:", svr_mse)
print("SVR Mean Absolute Error:", svr_mae)
print("SVR Cross-Validation Mean Squared Error:", svr_cv_mse)
print("SVR Cross-Validation Mean Absolute Error:", svr_cv_mae)

print("\nLinear Regression Mean Squared Error:", linear_mse)
print("Linear Regression Mean Absolute Error:", linear_mae)
print("Linear Regression Cross-Validation Mean Squared Error:", linear_cv_mse)
print("Linear Regression Cross-Validation Mean Absolute Error:", linear_cv_mae)

print("\nDecision Tree Mean Squared Error:", tree_mse)
print("Decision Tree Mean Absolute Error:", tree_mae)
print("Decision Tree Cross-Validation Mean Squared Error:", tree_cv_mse)
print("Decision Tree Cross-Validation Mean Absolute Error:", tree_cv_mae)

X shape 
 : (2442, 100)
y shape 
 : (2442,)
SVR Mean Squared Error: 1.7081355269385852
SVR Mean Absolute Error: 0.8831747887436259
SVR Cross-Validation Mean Squared Error: 1.501548671434419
SVR Cross-Validation Mean Absolute Error: 0.8098534144620568

Linear Regression Mean Squared Error: 1.1601443355595447
Linear Regression Mean Absolute Error: 0.8111186780711899
Linear Regression Cross-Validation Mean Squared Error: 1.5186704911120006
Linear Regression Cross-Validation Mean Absolute Error: 0.9528925043094615

Decision Tree Mean Squared Error: 1.9088484673010309
Decision Tree Mean Absolute Error: 0.9663818288051418
Decision Tree Cross-Validation Mean Squared Error: 2.2580108964649583
Decision Tree Cross-Validation Mean Absolute Error: 1.0633033971850225


In [20]:
X = pd.DataFrame(df['sg_embedding'].tolist())#df.drop(columns=["tokenized_answer","sg_embedding","answer","score"])
X.add(df["correct"], axis=1)
y = df["score"]
print("X shape \n :", X.shape)
print("y shape \n :", y.shape)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize models
svr_model = SVR()
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor()

# Train and evaluate SVR
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_cv_mse = -cross_val_score(svr_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
svr_cv_mae = -cross_val_score(svr_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Train and evaluate Linear Regression
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)
linear_cv_mse = -cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
linear_cv_mae = -cross_val_score(linear_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Train and evaluate Decision Tree Regressor
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
tree_mse = mean_squared_error(y_test, y_pred_tree)
tree_mae = mean_absolute_error(y_test, y_pred_tree)
tree_cv_mse = -cross_val_score(tree_model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
tree_cv_mae = -cross_val_score(tree_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Print results
print("SVR Mean Squared Error:", svr_mse)
print("SVR Mean Absolute Error:", svr_mae)
print("SVR Cross-Validation Mean Squared Error:", svr_cv_mse)
print("SVR Cross-Validation Mean Absolute Error:", svr_cv_mae)

print("\nLinear Regression Mean Squared Error:", linear_mse)
print("Linear Regression Mean Absolute Error:", linear_mae)
print("Linear Regression Cross-Validation Mean Squared Error:", linear_cv_mse)
print("Linear Regression Cross-Validation Mean Absolute Error:", linear_cv_mae)

print("\nDecision Tree Mean Squared Error:", tree_mse)
print("Decision Tree Mean Absolute Error:", tree_mae)
print("Decision Tree Cross-Validation Mean Squared Error:", tree_cv_mse)
print("Decision Tree Cross-Validation Mean Absolute Error:", tree_cv_mae)

X shape 
 : (2442, 100)
y shape 
 : (2442,)
SVR Mean Squared Error: 1.4885073364180312
SVR Mean Absolute Error: 0.8345361108794681
SVR Cross-Validation Mean Squared Error: 1.400808093781913
SVR Cross-Validation Mean Absolute Error: 0.8198080003435667

Linear Regression Mean Squared Error: 1.125377583937165
Linear Regression Mean Absolute Error: 0.803104527467518
Linear Regression Cross-Validation Mean Squared Error: 1.5284370571591552
Linear Regression Cross-Validation Mean Absolute Error: 0.9543217870264348

Decision Tree Mean Squared Error: 1.888172290388548
Decision Tree Mean Absolute Error: 0.9471370143149284
Decision Tree Cross-Validation Mean Squared Error: 2.4156431014086346
Decision Tree Cross-Validation Mean Absolute Error: 1.0920047131201964


### Conclusion Part 1:

SVR seems to be the most stable model with consistent performance across both embedding methods.
Linear Regression and Decision Tree show higher variance in performance, with Skip Gram embeddings generally yielding better results.

# Part 2: Language Modeling / Classification

In [21]:
dTr = pd.read_csv("twitter_training.csv",names=['id','Entity',"sentiment","text"])
dTe = pd.read_csv("twitter_validation.csv",names=['id','Entity',"sentiment","text"])

In [22]:
dTr.drop(columns=["id"],inplace=True)
dTe.drop(columns=["id"],inplace=True)

In [23]:
def preprocess_text(text):
    if isinstance(text, float):
        text = ""  # Convert NaN or non-string to an empty string
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = text.split()
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

# Apply the preprocessing function to the training and validation data
dTr['text'] = dTr['text'].apply(preprocess_text)
dTe['text'] = dTe['text'].apply(preprocess_text)


In [24]:
# Combine training and validation data for training Word2Vec
combined_data = pd.concat([dTr['text'], dTe['text']])

# Train Word2Vec model
w2v_model = Word2Vec(sentences=combined_data, vector_size=100, window=5, min_count=1, sg=1)

# Get the vocabulary size
vocab_size = len(w2v_model.wv.key_to_index)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 27


### TF-IDF

In [25]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(dTr['text']).toarray()

# Transform the validation data
X_test_tfidf = tfidf_vectorizer.transform(dTe['text']).toarray()

print(X_train_tfidf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
def get_average_word2vec(tokens, model, vector_size):
    if len(tokens) < 1:
        return np.zeros(vector_size)
    vectorized = [model.wv[word] for word in tokens if word in model.wv.key_to_index]
    if len(vectorized) < 1:
        return np.zeros(vector_size)
    return np.mean(vectorized, axis=0)

# Apply the function to get embeddings for training and validation data
X_train = np.array([get_average_word2vec(tokens, w2v_model, 100) for tokens in dTr['text']])
X_test = np.array([get_average_word2vec(tokens, w2v_model, 100) for tokens in dTe['text']])

# Target variable
y_train = dTr['sentiment']  # Assuming the sentiment column is the target
y_test = dTe['sentiment']


In [27]:
# Initialize models
svm_model = SVC()
logistic_model = LogisticRegression(max_iter=1000)
tree_model = DecisionTreeClassifier()



In [28]:
# Train and evaluate Logistic Regression
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logistic))



Logistic Regression Accuracy: 0.346
Logistic Regression Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00       172
    Negative       0.31      0.66      0.42       266
     Neutral       0.43      0.29      0.35       285
    Positive       0.37      0.32      0.34       277

    accuracy                           0.35      1000
   macro avg       0.28      0.32      0.28      1000
weighted avg       0.31      0.35      0.30      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Train and evaluate Decision Tree
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_tree))


Decision Tree Accuracy: 0.874
Decision Tree Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.84      0.91      0.87       172
    Negative       0.84      0.91      0.87       266
     Neutral       0.91      0.85      0.87       285
    Positive       0.91      0.84      0.87       277

    accuracy                           0.87      1000
   macro avg       0.87      0.88      0.87      1000
weighted avg       0.88      0.87      0.87      1000



In [30]:
# Initialize Min-Max Scaler
scaler = MinMaxScaler()

# Scale the word vectors
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
nb_model = MultinomialNB()
# Train and evaluate Decision Tree
nb_model.fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
# Evaluate the model
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.271
Naive Bayes Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00       172
    Negative       0.27      1.00      0.42       266
     Neutral       0.00      0.00      0.00       285
    Positive       1.00      0.02      0.04       277

    accuracy                           0.27      1000
   macro avg       0.32      0.25      0.11      1000
weighted avg       0.35      0.27      0.12      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Conclusion Part 2

Based on the performance metrics, the Decision Tree model significantly outperformed both the Logistic Regression and Naive Bayes models in terms of accuracy and f1-score across all classes. The Decision Tree achieved an accuracy of 86.6% and consistently high precision, recall, and f1-scores for all sentiment categories. In contrast, Logistic Regression and Naive Bayes struggled particularly with the "Irrelevant" and "Neutral" classes, showing poor precision and recall. 

Therefore, for this sentiment analysis task using Skip-Gram embeddings, the Decision Tree model is the most effective, providing the best balance between precision and recall across all sentiment categories.
