# AML Midterm
## Team Members:
Matthew Maitland, Sofia Beyerlein, Shreeya Indap

In [151]:
# import packages

import sklearn
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.mixture import GaussianMixture
import scipy
import contractions

# Data Preprocessing Steps

In [152]:
# Read the train / validation / test splits from CSV files in the working directory.
train_data, val_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

In [153]:
# Build the working frames with lower-cased phrases.
# `.assign` returns a NEW frame, so the raw `*_data` frames read from CSV stay untouched and
# this cell can be re-run safely. (Wrapping an existing frame in pd.DataFrame(...) does not
# copy by default, so assigning into the wrapper could also modify the raw frame.)
X_train = train_data.assign(Phrase=train_data['Phrase'].str.lower())
X_test = test_data.assign(Phrase=test_data['Phrase'].str.lower())
X_dev = val_data.assign(Phrase=val_data['Phrase'].str.lower())

In [154]:
# Train/dev: discard rows without a phrase, then any row that still has a NaN in some column.
X_train = X_train.dropna(subset=['Phrase']).dropna()
X_dev = X_dev.dropna(subset=['Phrase']).dropna()

# Test: every row is kept; a missing phrase is replaced by the empty string instead.
X_test['Phrase'] = X_test['Phrase'].fillna(value="")


In [155]:
# One shared lemmatizer instance, reused for every phrase.
wnl = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join them with single spaces.

    The lemmatizer is called with the token only (no part-of-speech argument), so its
    default part of speech applies to every token.
    """
    lemmas = map(wnl.lemmatize, text.split())
    return ' '.join(lemmas)

In [156]:
# Lemmatize the 'Phrase' column of every split.
for _frame in (X_train, X_dev, X_test):
    _frame['Phrase'] = _frame['Phrase'].apply(lemmatize_text)

In [157]:
# Delete ASCII punctuation from every split.
# string.punctuation contains `\`, `]`, `^` and `-`, all of which are special inside a regex
# character class. re.escape keeps each one literal; without it the `\` is consumed as an
# escape for the following `]`, so backslashes were never matched.
PUNCTUATION_PATTERN = f"[{re.escape(string.punctuation)}]"

X_train['Phrase'] = X_train['Phrase'].str.replace(PUNCTUATION_PATTERN, "", regex=True)
X_dev['Phrase'] = X_dev['Phrase'].str.replace(PUNCTUATION_PATTERN, "", regex=True)
X_test['Phrase'] = X_test['Phrase'].str.replace(PUNCTUATION_PATTERN, "", regex=True)

In [158]:
# Expand contractions in every split; contractions.fix is handed to .apply directly.
# NOTE(review): the punctuation-stripping cell runs before this one and removes apostrophes;
# confirm that contractions.fix still recognises the apostrophe-less forms.
for _frame in (X_train, X_dev, X_test):
    _frame['Phrase'] = _frame['Phrase'].apply(contractions.fix)

In [159]:
def clean(text):
    """
    Normalize a raw phrase into lower-case, space-separated ASCII word tokens.

    Based on the assignment template code, with extra substitutions.

    Steps, in order:
      1. remove any line that starts with an http(s) URL;
      2. decode the HTML fragments ``<br />``, ``&quot;``, ``&#39;`` and ``&amp;``, then strip
         any remaining ``<...>`` tags;
      3. expand the stand-alone token " u " to " you " and delete backticks;
      4. replace every non-word character with a space;
      5. drop stand-alone single letters (including one at the very start) and digit runs;
      6. lower-case, collapse whitespace runs, and discard non-ASCII characters.

    Parameters
    ----------
    text : object
        Raw phrase; it is coerced with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned phrase. It is not stripped, so a single leading or trailing space can
        remain where punctuation or digits used to be.
    """
    # Remove any line that begins with an http(s) URL.
    texter = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)

    # Decode the HTML fragments the template knows about.
    texter = re.sub(r"<br />", " ", texter)
    texter = re.sub(r"&quot;", "\"", texter)
    texter = re.sub('&#39;', "'", texter)  # &#39; is the apostrophe entity, not a double quote
    texter = re.sub('&amp;', 'and', texter)

    # Strip remaining tags NOW, while the angle brackets still exist. Doing this after the
    # non-word pass below can never match, and leaves tag names behind as words.
    texter = re.sub(r'<.*?>', '', texter)

    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)

    # Every non-word character (punctuation, quotes, parentheses, newlines, ...) becomes a
    # space, so no separate punctuation / quote / parenthesis passes are needed afterwards.
    texter = re.sub(r'\W', ' ', texter)

    # Drop stand-alone single letters in the middle of the text ...
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # ... and at the very start (anchored with ^; an escaped \^ would match a literal caret).
    texter = re.sub(r'^[a-zA-Z]\s+', '', texter)

    # Remove numbers.
    texter = re.sub(r'\d+', ' ', texter)

    texter = texter.lower()

    # Collapse whitespace runs left behind by the substitutions above.
    texter = re.sub(r'\s+', ' ', texter)

    # Discard any character that cannot be represented in ASCII.
    return texter.encode('ascii', 'ignore').decode('ascii')

In [160]:
# Run the regex-based cleaner over the 'Phrase' column of every split.
for _frame in (X_train, X_dev, X_test):
    _frame['Phrase'] = _frame['Phrase'].apply(clean)

In [161]:
# split into labeled and unlabeled dfs (rows whose 'Sentiment' equals -100 go to the unlabeled frame)
# .copy() makes each subset an independent frame, so assigning columns on it later neither
# triggers SettingWithCopyWarning nor depends on whether pandas returned a view of X_train.
is_unlabeled = X_train['Sentiment'] == -100
X_train_unlabeled = X_train[is_unlabeled].copy()
X_train_labeled = X_train[~is_unlabeled].copy()

In [162]:
# Report the size of each subset and their combined total.
n_unlabeled, n_labeled = len(X_train_unlabeled), len(X_train_labeled)
print(n_unlabeled, n_labeled)
print(n_unlabeled + n_labeled)

34948 24753
59701


# TF-IDF Vectorization (Unigrams through Trigrams)

In [163]:
# TF-IDF over 1- to 3-grams, vocabulary capped at 50,000 features.
# The vectorizer is fitted on the labeled phrases only; the unlabeled phrases are then
# projected into that same feature space.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=50000)

labeled_phrases = X_train_labeled['Phrase']
unlabeled_phrases = X_train_unlabeled['Phrase']
X_train_labeled_vect = vectorizer.fit_transform(labeled_phrases)
X_train_unlabeled_vect = vectorizer.transform(unlabeled_phrases)

In [164]:
# Stack the labeled rows on top of the unlabeled rows into one sparse matrix.
# NOTE(review): X_train_combined_vect is not referenced by any later cell visible here.
X_train_combined_vect = scipy.sparse.vstack(
    (X_train_labeled_vect, X_train_unlabeled_vect)
)

# Model Before Augmentation

In [167]:
# Baseline: logistic regression trained on the labeled rows only,
# with class weights balanced and a fixed seed.
clf = LogisticRegression(
    class_weight='balanced',
    random_state=0,
    max_iter=3000,
)
clf.fit(X_train_labeled_vect, X_train_labeled['Sentiment'])

LogisticRegression(class_weight='balanced', max_iter=3000, random_state=0)

In [168]:
# Gold labels and TF-IDF features for the dev split (using the already-fitted vectorizer).
y_val = X_dev['Sentiment']
X_val_vect = vectorizer.transform(X_dev['Phrase'])

# Predicted sentiment for every dev phrase.
y_pred_val = clf.predict(X_val_vect)

# Accuracy and support-weighted F1 on the dev split.
accuracy = accuracy_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val, average='weighted')
print(
    f"Validation Accuracy before Augmentation: {accuracy:.4f}",
    f"Validation F1 Score before Augmentation: {f1:.4f}",
    sep="\n",
)

Validation Accuracy before Augmentation: 0.9366
Validation F1 Score before Augmentation: 0.9365


# GMM for Data Augmentation

Note: This is not needed to validate results, as the final model does not use the GMM.

In [169]:
# Smaller vocabulary (5,000 features) because the GMM is slow.
# This rebinds `vectorizer` and both `*_vect` matrices used by the cells that follow.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

labeled_phrases = X_train_labeled['Phrase']
unlabeled_phrases = X_train_unlabeled['Phrase']
X_train_labeled_vect = vectorizer.fit_transform(labeled_phrases)
X_train_unlabeled_vect = vectorizer.transform(unlabeled_phrases)

In [170]:
# fit GMM/get cluster labels for labeled data
# The GMM is fed a dense array; densify the labeled matrix once and reuse it.
X_train_labeled_dense = X_train_labeled_vect.toarray()
gmm = GaussianMixture(n_components=5, random_state=0)
gmm.fit(X_train_labeled_dense)
labeled_cluster_assignments = gmm.predict(X_train_labeled_dense)

# map each cluster to the most common sentiment among its labeled members
cluster_to_class_mapping = {}
for cluster in np.unique(labeled_cluster_assignments):
    cluster_indices = np.where(labeled_cluster_assignments == cluster)[0]
    common_sentiment = Counter(X_train_labeled.iloc[cluster_indices]['Sentiment']).most_common(1)[0][0]
    cluster_to_class_mapping[cluster] = common_sentiment

# A cluster that received no labeled rows has no entry in the mapping; unlabeled rows that
# land in such a cluster fall back to the overall most common labeled sentiment instead of
# raising KeyError.
fallback_sentiment = Counter(X_train_labeled['Sentiment']).most_common(1)[0][0]

# predict clusters for the unlabeled rows
unlabeled_cluster_assignments = gmm.predict(X_train_unlabeled_vect.toarray())

# Attach the pseudo-labels to a NEW frame: X_train_unlabeled is reused by later cells, so it
# is left unmodified (this also avoids pandas' SettingWithCopyWarning).
X_train_gmm_labeled = X_train_unlabeled.assign(
    Sentiment=[
        cluster_to_class_mapping.get(cluster, fallback_sentiment)
        for cluster in unlabeled_cluster_assignments
    ]
)

# Combine w/ originally labeled
X_train_augmented = pd.concat([X_train_labeled, X_train_gmm_labeled])
y_train_augmented = X_train_augmented['Sentiment']

# refit and evaluate (y_val comes from the earlier validation cell)
X_train_augmented_vect = vectorizer.transform(X_train_augmented['Phrase'])
clf.fit(X_train_augmented_vect, y_train_augmented)
X_val_vect = vectorizer.transform(X_dev['Phrase'])
y_pred_val = clf.predict(X_val_vect)

accuracy = accuracy_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val, average='weighted')
print(f"Validation Accuracy with GMM Augmented Data: {accuracy:.4f}")
print(f"Validation F1 Score with GMM Augmented Data: {f1:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Validation Accuracy with GMM Augmented Data: 0.8426
Validation F1 Score with GMM Augmented Data: 0.8438


# Iterative Multinomial LogReg for Data Augmentation

In [176]:
# Back to the large vocabulary (50,000 features over 1- to 3-grams), fitted on labeled phrases.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=50000)

labeled_phrases = X_train_labeled['Phrase']
unlabeled_phrases = X_train_unlabeled['Phrase']
X_train_labeled_vect = vectorizer.fit_transform(labeled_phrases)
X_train_unlabeled_vect = vectorizer.transform(unlabeled_phrases)

In [177]:
# Confidence cut-offs for self-training, visited from strictest to most permissive.
thresholds = [.99,.98,.97,.96,.95,.9,.85,.8]

# init w/ original labeled data
X_train_augmented = X_train_labeled.copy()

# Row labels already present in the training set, so no row is ever added twice.
added_indices = set(X_train_augmented.index)

for threshold in thresholds:
    print(f"Processing with threshold: {threshold}")

    # retrain on the current augmented set
    X_train_augmented_vect = vectorizer.transform(X_train_augmented['Phrase'])
    clf.fit(X_train_augmented_vect, X_train_augmented['Sentiment'])

    # class probabilities for every unlabeled row; a row is "high confidence" when its
    # largest class probability reaches the current threshold
    y_pred_probs = clf.predict_proba(X_train_unlabeled_vect)
    high_confidence_mask = y_pred_probs.max(axis=1) >= threshold
    if not high_confidence_mask.any():
        # Nothing qualifies at this threshold. Skip ahead: scikit-learn's input validation
        # rejects a zero-row matrix, so calling predict here would raise.
        continue
    y_pred_high_conf = clf.predict(X_train_unlabeled_vect[high_confidence_mask])

    # only select high-confidence rows --> assign predicted labels (on a copy)
    X_train_high_conf = X_train_unlabeled.loc[high_confidence_mask].copy()
    X_train_high_conf['Sentiment'] = y_pred_high_conf

    # filter out rows that were already added at an earlier (stricter) threshold;
    # those keep the label they were given then
    X_train_high_conf = X_train_high_conf.loc[~X_train_high_conf.index.isin(added_indices)]

    # remember the new rows and append them to the training set
    added_indices.update(X_train_high_conf.index)
    X_train_augmented = pd.concat([X_train_augmented, X_train_high_conf])


print("Final number of rows in X_train_augmented:", len(X_train_augmented))


Processing with threshold: 0.99
Processing with threshold: 0.98
Processing with threshold: 0.97
Processing with threshold: 0.96
Processing with threshold: 0.95
Processing with threshold: 0.9
Processing with threshold: 0.85
Processing with threshold: 0.8
Final number of rows in X_train_augmented: 53374


In [178]:
# Labels of the augmented training set (original labels plus the high-confidence predictions).
y_train_augmented = X_train_augmented.loc[:, 'Sentiment']

In [179]:
# Re-vectorize the finished augmented set and refit the classifier on it for evaluation.
augmented_phrases = X_train_augmented['Phrase']
X_train_augmented_vect = vectorizer.transform(augmented_phrases)
clf.fit(X_train_augmented_vect, y_train_augmented)

LogisticRegression(class_weight='balanced', max_iter=3000, random_state=0)

In [180]:
# Score the classifier trained on the augmented data against the dev split.
y_val = X_dev['Sentiment']
X_val_vect = vectorizer.transform(X_dev['Phrase'])

y_pred_val = clf.predict(X_val_vect)

# Accuracy and support-weighted F1 on the dev split.
accuracy = accuracy_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val, average='weighted')

print(
    f"Validation Accuracy with Augmented Data: {accuracy:.4f}",
    f"Validation F1 Score with Augmented Data: {f1:.4f}",
    sep="\n",
)


Validation Accuracy with Augmented Data: 0.9383
Validation F1 Score with Augmented Data: 0.9384


# Final Model

In [181]:
# Final training set: the augmented training rows followed by the dev rows.
X_train_final = pd.concat((X_train_augmented, X_dev))
y_train_final = X_train_final['Sentiment']

# Refit the vectorizer on the full final set, so its vocabulary also covers the dev phrases.
final_phrases = X_train_final['Phrase']
X_train_final_vect = vectorizer.fit_transform(final_phrases)

# Train the classifier on everything.
clf.fit(X_train_final_vect, y_train_final)

# Predict sentiment for the test phrases.
test_phrases = X_test['Phrase']
X_test_vect = vectorizer.transform(test_phrases)
y_pred_test_final = clf.predict(X_test_vect)

# Build the submission table and write it to disk without the pandas index.
# NOTE(review): PhraseID is taken from X_test's row index, not from a column of test.csv —
# confirm this matches the IDs the grader expects.
submission_columns = {
    "PhraseID": X_test.index,
    "Sentiment": y_pred_test_final,
}
submission_test_final = pd.DataFrame(submission_columns)
submission_test_final.to_csv("submission_test_final.csv", index=False)

submission_test_final.head()


Unnamed: 0,PhraseID,Sentiment
0,0,3
1,1,2
2,2,4
3,3,1
4,4,4
