# PA3b Sentiment Classification
#### Applied Machine Learning
Grpup 39: Sebastian Kölbel & Min Ze Teh

We begin by importing libraries needed for the task

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# for converting training and test datasets into matrices
from sklearn.feature_extraction.text import TfidfVectorizer

We create a function for preprocessing the csv-files. Thes preprocessing steps include:
* Making text all lower case
* Removing leading and closing white-space
* Make sure punctuations are separated from words

In [2]:
def read_sentiments(doc_file):
    with open(doc_file, 'r', encoding='utf-8') as f:
        new_lines = []
        for line in f:
            line = line.lower().removesuffix('\n').strip().replace('.',' . ')
            new_lines.append(line.split('\t'))
                
        return pd.DataFrame(new_lines,columns=['sentiment','text'])

We then separate the input values from the output values. The crowdsourced data needed some more preprocessing. There we too many values for the sentiments all consisting of typos. We checked how many errors there were, however there were only 75 errors out of more than 10 000 rows so the errors were dropped to save time.

In [3]:
df = read_sentiments('Data/crowdsourced_train.csv').drop(0)
cs_training = df[df['sentiment'].isin(['positive','negative','neutral'])]
print('Dropped rows in crowdsourced data:',len(df)-len(cs_training))
X_cs_train = cs_training.drop('sentiment', axis=1)
Y_cs_train = cs_training['sentiment']

gold_training = read_sentiments('Data/gold_train.csv').drop(0)
X_gold_train = gold_training.drop('sentiment', axis=1)
Y_gold_train = gold_training['sentiment']


testing = read_sentiments('Data/test.csv').drop(0)
X_test = testing.drop('sentiment', axis=1)
Y_test = testing['sentiment']

Dropped rows in crowdsourced data: 75


Using Tfidf vectorizer and LinearSVC

In [23]:
# def train_document_classifier(X, Y):
#     pipeline = make_pipeline( TfidfVectorizer(), LinearSVC(dual='auto') )
#     pipeline.fit(X, Y)
#     return pipeline

# def train_document_classifier(X, Y):
#     pipeline = make_pipeline(TfidfVectorizer(), LinearSVC(penalty='l2', loss='squared_hinge', dual=False))
#     pipeline.fit(X, Y)
#     return pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def train_document_classifier(X, Y):
    # Define the pipeline with TfidfVectorizer and LinearSVC
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC())
    ])
    
    # Define the parameter grid
    param_grid = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
        'tfidf__max_features': [1000, 5000, 10000],  # Maximum number of features
        'clf__C': [0.1, 1, 10]  # Regularization parameter
    }
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, Y)
    
    # Return the best trained pipeline
    return grid_search.best_estimator_


In [24]:
from sklearn.metrics import classification_report

# Convert the DataFrame column containing text data into a list of strings
X_cs_train_text = X_cs_train['text'].tolist()

# Now X_cs_train_text is a list of strings representing the text data for each document
# Train the pipeline
pipeline = train_document_classifier(X_cs_train_text, Y_cs_train)

X_test_text = X_test['text'].tolist()

# Test the trained pipeline
Y_pred = pipeline.predict(X_test_text)

# Evaluate the performance
print(classification_report(Y_test, Y_pred))




              precision    recall  f1-score   support

    negative       0.58      0.40      0.47      1077
     neutral       0.59      0.82      0.69      2597
    positive       0.73      0.47      0.57      1850

    accuracy                           0.62      5524
   macro avg       0.64      0.56      0.58      5524
weighted avg       0.64      0.62      0.61      5524



In [25]:
from sklearn.metrics import classification_report

# Convert the DataFrame column containing text data into a list of strings
X_gold_train_text = X_gold_train['text'].tolist()

# Now X_gold_train_text is a list of strings representing the text data for each document
# Train the pipeline
pipeline = train_document_classifier(X_gold_train_text, Y_gold_train)

X_test_text = X_test['text'].tolist()

# Test the trained pipeline
Y_pred = pipeline.predict(X_test_text)

# Evaluate the performance
print(classification_report(Y_test, Y_pred))




              precision    recall  f1-score   support

    negative       0.77      0.29      0.42      1077
     neutral       0.62      0.86      0.72      2597
    positive       0.74      0.61      0.67      1850

    accuracy                           0.67      5524
   macro avg       0.71      0.59      0.60      5524
weighted avg       0.69      0.67      0.65      5524



Using Tfidf vectorizer and Logistic Regression

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def train_document_classifier_log_reg(X, Y):
    # Define the pipeline with TfidfVectorizer and LogisticRegression
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    
    # Define the parameter grid
    param_grid = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
        'tfidf__max_features': [1000, 5000, 10000],  # Maximum number of features
        'clf__C': [0.1, 1, 10]  # Regularization parameter
    }
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, Y)
    
    # Return the best trained pipeline
    return grid_search.best_estimator_


In [29]:
from sklearn.metrics import classification_report

# Convert the DataFrame column containing text data into a list of strings
X_cs_train_text = X_cs_train['text'].tolist()

# Now X_cs_train_text is a list of strings representing the text data for each document
# Train the pipeline
pipeline = train_document_classifier_log_reg(X_cs_train_text, Y_cs_train)

X_test_text = X_test['text'].tolist()

# Test the trained pipeline
Y_pred = pipeline.predict(X_test_text)

# Evaluate the performance
print(classification_report(Y_test, Y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              precision    recall  f1-score   support

    negative       0.55      0.47      0.51      1077
     neutral       0.60      0.77      0.68      2597
    positive       0.72      0.49      0.58      1850

    accuracy                           0.62      5524
   macro avg       0.62      0.58      0.59      5524
weighted avg       0.63      0.62      0.61      5524



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
from sklearn.metrics import classification_report

# Convert the DataFrame column containing text data into a list of strings
X_gold_train_text = X_gold_train['text'].tolist()

# Now X_gold_train_text is a list of strings representing the text data for each document
# Train the pipeline
pipeline = train_document_classifier_log_reg(X_gold_train_text, Y_gold_train)

X_test_text = X_test['text'].tolist()

# Test the trained pipeline
Y_pred = pipeline.predict(X_test_text)

# Evaluate the performance
print(classification_report(Y_test, Y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              precision    recall  f1-score   support

    negative       0.73      0.40      0.51      1077
     neutral       0.65      0.84      0.73      2597
    positive       0.75      0.65      0.69      1850

    accuracy                           0.69      5524
   macro avg       0.71      0.63      0.65      5524
weighted avg       0.70      0.69      0.68      5524



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
