# Text Classification (Mini Project 2)
Team Members:
- 1. Mao Hieng
- 2. Vin Samdy
- 3. Som Sokleap

In [2]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

## Load data

In [3]:
# Load files
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()

In [60]:
positive_reviews = load_file('positive-reviews.txt')
negative_reviews = load_file('negative-reviews.txt')

print(len(positive_reviews), len(negative_reviews))
positive_reviews[:5], negative_reviews[:5]

20000 20000


(['Size, Size, ans Size.\n',
  'Excellent quality, speedy printing, low cost\n',
  'Cheap, good quality, small size\n',
  'Attractive design, satisfying features, the backlight!\n',
  'Pretty much has every feature you could possibly need, great look\n'],
 ['Image quality not as good as some other brands, cheap feel to body.\n',
  'nothing\n',
  'Black text could be better, ink runs out kind of fast\n',
  "The thing won't work\n",
  'Display grainy, small keypad for text messaging, poor vibrate function\n'])

## Creating Features

In [61]:
positive_words = set(load_file('positive-words.txt'))
negative_words = set(load_file('negative-words.txt'))

print(len(positive_words), len(negative_words))

2006 4780


In [6]:
def extract_features(reviews):
    features = []
    for i, review in enumerate(reviews):
        review = review.lower()
        tokens = re.findall(r'\b\w+\b', review)

        pos_count = sum(1 for word in tokens if word in positive_words)
        neg_count  = sum(1 for word in tokens if word in negative_words)
        contains_no = int('no' in tokens)
        pronoun_count = sum(1 for word in tokens if word in ['i', 'me', 'my', 'you', 'your'])
        contains_exclamation = int('!' in review)
        log_length = np.log(len(tokens) + 1)

        features.append([pos_count, neg_count, contains_no, pronoun_count, contains_exclamation, log_length])
    
    return np.array(features)

## Preparing Data

In [7]:
positive_labels = [1] * len(positive_reviews)
negative_labels = [0] * len(negative_reviews)

reviews = positive_reviews + negative_reviews
labels = positive_labels + negative_labels

## Normal Features

In [8]:
X = extract_features(reviews)
print(X.shape)
print(X[:5])
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

(40000, 6)
[[0.         0.         0.         0.         0.         1.60943791]
 [0.         0.         0.         0.         0.         1.94591015]
 [0.         0.         0.         0.         0.         1.79175947]
 [0.         0.         0.         0.         1.         1.94591015]
 [0.         0.         0.         1.         0.         2.48490665]]


## Train and evaluate models

In [9]:
import joblib
import os

def train_and_evaluate_model(model, model_name, trainset, testset):
    model_exists = os.path.exists(f"{model_name}.pkl")
    if model_exists:
        model = joblib.load(f"{model_name}.pkl")
    else:
        model.fit(trainset, y_train)
    
    y_pred = model.predict(testset)
    accuracy = accuracy_score(y_test, y_pred)
    
    if not model_exists:
        joblib.dump(model, f"{model_name}.pkl")
    print(f"Accuracy of {model_name}: {accuracy:.4f}")

In [37]:
models = [
    (LogisticRegression(), "Logistic Regression Default Features"),
    (RandomForestClassifier(), "Random Forest Default Features"),
    (MultinomialNB(), "Naive Bayes classifier Default Features"),
    (SVC(), "Support Vector Machine Default Features")
]

In [38]:
for model, model_name in models:
    train_and_evaluate_model(model, model_name, X_train, X_test)

Accuracy of Logistic Regression Default Features: 0.5919
Accuracy of Random Forest Default Features: 0.5952
Accuracy of Naive Bayes classifier Default Features: 0.5715
Accuracy of Support Vector Machine Default Features: 0.5904


## More Features

### Apply TF-IDF

In [None]:
tfidf_max_features = 5000

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)
tfidf_features = tfidf_vectorizer.fit_transform(reviews).toarray()

In [13]:
print(tfidf_features.shape)

(40000, 5000)


In [14]:
X2 = np.hstack([X, tfidf_features])
print(X2.shape)
X2_train, X2_test, y_train, y_test = train_test_split(X2, labels, test_size=0.2, random_state=42)

(40000, 5006)


In [15]:
tfidf_models = [
    (LogisticRegression(), "Logistic Regression TF-IDF Features"),
    (RandomForestClassifier(), "Random Forest TF-IDF Features"),
    (MultinomialNB(), "Naive Bayes classifier TF-IDF Features"),
    # (SVC(), "Support Vector Machine TF-IDF Features")
]

In [39]:
for model, model_name in tfidf_models:
    train_and_evaluate_model(model, model_name, X2_train, X2_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of Logistic Regression TF-IDF Features: 0.9300
Accuracy of Random Forest TF-IDF Features: 0.9233
Accuracy of Naive Bayes classifier TF-IDF Features: 0.9134


### Apply PMI

#### Compute PMI Scores

In [17]:
from collections import Counter

word_counts = Counter()
positive_counts = Counter()
negative_counts = Counter()

for review, label in zip(reviews, labels):
    tokens = set(re.findall(r'\b\w+\b', review.lower()))
    word_counts.update(tokens)
    if label == 1:
        positive_counts.update(tokens)
    else:
        negative_counts.update(tokens)

In [18]:
total_words = sum(word_counts.values())
positive_total = sum(positive_counts.values())
negative_total = sum(negative_counts.values())

In [19]:
vocab = tfidf_vectorizer.get_feature_names_out()

In [63]:
print(len(vocab))
print(vocab[:5])

5000
['00' '000' '007' '01' '02']


In [20]:
pmi_scores = {}
for word in vocab:
    p_word = word_counts[word] / total_words
    p_word_positive = (positive_counts[word] / positive_total) if word in positive_counts else 0
    p_word_negative = (negative_counts[word] / negative_total) if word in negative_counts else 0

    if p_word_positive > 0:
        pmi_scores[word] = np.log2(p_word_positive / p_word)
    elif p_word_negative > 0:
        pmi_scores[word] = -np.log2(p_word_negative / p_word)
    else:
        pmi_scores[word] = 0

In [21]:
print(pmi_scores)



In [22]:
pmi_features = np.array([[pmi_scores.get(word, 0) for word in vocab] for review in reviews])

In [23]:
print(pmi_features.shape)

(40000, 5000)


In [24]:
X3 = np.hstack([X2, pmi_features])
# clean up negative values
X3 = np.maximum(X3, 0)
print(X3[:5])
print(X3.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(40000, 10006)


In [25]:
X3_train, X3_test, y_train, y_test = train_test_split(X3, labels, test_size=0.2, random_state=42)

In [26]:
final_models = [
    (LogisticRegression(), "Logistic Regression TF-IDF PMI Features"),
    (RandomForestClassifier(), "Random Forest TF-IDF PMI Features"),
    (MultinomialNB(), "Naive Bayes classifier TF-IDF PMI Features"),
    # (SVC(), "Support Vector Machine")
]

In [40]:
for model, model_name in final_models:
    train_and_evaluate_model(model, model_name, X3_train, X3_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of Logistic Regression TF-IDF PMI Features: 0.9250
Accuracy of Random Forest TF-IDF PMI Features: 0.9247
Accuracy of Naive Bayes classifier TF-IDF PMI Features: 0.9123


## Testing

In [28]:
def predict_review(model_path:str, review:str)->str:
    model = joblib.load(model_path)
    features = extract_features([review])
    tfidf_features = tfidf_vectorizer.transform([review]).toarray()
    pmi_features = np.array([[pmi_scores.get(word, 0) for word in vocab]])
    X = np.hstack([features, tfidf_features, pmi_features])
    y_pred = model.predict(X)
    return 'Positive' if y_pred[0] == 1 else 'Negative'


In [29]:
predict_review('Logistic Regression TF-IDF PMI Features.pkl', "I love this movie")

'Positive'

In [30]:
predict_review('Logistic Regression TF-IDF PMI Features.pkl', "I don't love this movie")

'Negative'

## Challenging

In [31]:
def challenge(model_path:str, file_path:str, output_path:str)->None:
    model = joblib.load(model_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        reviews = f.readlines()
    features = extract_features(reviews)
    tfidf_features = tfidf_vectorizer.transform(reviews).toarray()
    pmi_features = np.array([[pmi_scores.get(word, 0) for word in vocab] for review in reviews])
    X = np.hstack([features, tfidf_features, pmi_features])
    y_pred = model.predict(X)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(map(str, y_pred)))

In [32]:
challenge('Logistic Regression TF-IDF PMI Features.pkl','challenge_data.txt', 'challenge_output.txt')

### Challenges 2

In [43]:
def challenge2(model_path:str, input_path:str, output_path:str)->None:
    reviews = load_file(input_path)
    default_features = extract_features(reviews)

    # tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)
    # tfidf_features = tfidf_vectorizer.fit_transform(reviews).toarray()
    tfidf_features = tfidf_vectorizer.transform(reviews).toarray()

    features = np.hstack([default_features, tfidf_features])

    model = joblib.load(model_path)

    predicts = model.predict(features)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(map(str, predicts)))

In [44]:
print(tfidf_models[0][1])

Logistic Regression TF-IDF Features


In [45]:
challenge2(f"{tfidf_models[0][1]}.pkl",'challenge_data.txt', 'challenge_output2.txt')

In [53]:
challenge_inputs = load_file('challenge_data.txt')
challenge_outputs = load_file('challenge_output2.txt')

for input, output in zip(challenge_inputs, challenge_outputs):
    with open('challenge_merged.txt', 'a', encoding='utf-8') as f:
        f.write(f"{input}{output}")

## Other Experiments

### TF-IDF Alone

In [54]:
X_tfidf = tfidf_features
X_tfidf_train, X_tfidf_test, _, _ = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

In [55]:
train_and_evaluate_model(LogisticRegression(), "Logistic TF-IDF Alone", X_tfidf_train, X_tfidf_test)

Accuracy of Logistic TF-IDF Alone: 0.9304


### PMI Features Alone

In [56]:
X_pmi = pmi_features
X_pmi_train, X_pmi_test, _, _ = train_test_split(X_pmi, labels, test_size=0.2, random_state=42)

In [57]:
train_and_evaluate_model(LogisticRegression(), "Logistic PMI Alone", X_pmi_train, X_pmi_test)

Accuracy of Logistic PMI Alone: 0.4983


### Default + PMI Features

In [58]:
X_pmi_2 = np.hstack([X, pmi_features])
X_pmi_2_train, X_pmi_2_test, _, _ = train_test_split(X_pmi_2, labels, test_size=0.2, random_state=42)

In [59]:
train_and_evaluate_model(LogisticRegression(), "Logistic Default PMI", X_pmi_2_train, X_pmi_2_test)

Accuracy of Logistic Default PMI: 0.5919
