# Text Classification (Mini Project 2)
author: Mao Hieng

In [1]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## Load data

In [2]:
# Load files
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()

In [3]:
positive_reviews = load_file('positive-reviews.txt')
negative_reviews = load_file('negative-reviews.txt')

positive_reviews[:5], negative_reviews[:5]

(['Size, Size, ans Size.\n',
  'Excellent quality, speedy printing, low cost\n',
  'Cheap, good quality, small size\n',
  'Attractive design, satisfying features, the backlight!\n',
  'Pretty much has every feature you could possibly need, great look\n'],
 ['Image quality not as good as some other brands, cheap feel to body.\n',
  'nothing\n',
  'Black text could be better, ink runs out kind of fast\n',
  "The thing won't work\n",
  'Display grainy, small keypad for text messaging, poor vibrate function\n'])

## Creating Features

In [4]:
positive_words = set(load_file('positive-words.txt'))
negative_words = set(load_file('negative-words.txt'))

In [5]:
def extract_features(reviews, more_features=None):
    features = []
    for i, review in enumerate(reviews):
        review = review.lower()
        tokens = re.findall(r'\b\w+\b', review)

        pos_count = sum(1 for word in tokens if word in positive_words)
        neg_count  = sum(1 for word in tokens if word in negative_words)
        contains_no = int('no' in tokens)
        pronoun_count = sum(1 for word in tokens if word in ['i', 'me', 'my', 'you', 'your'])
        contains_exclamation = int('!' in review)
        log_length = np.log(len(tokens) + 1)

        if more_features is not None:
            ff = more_features[i]
            features.append([pos_count, neg_count, contains_no, pronoun_count, contains_exclamation, log_length, *ff])
        else:
            features.append([pos_count, neg_count, contains_no, pronoun_count, contains_exclamation, log_length])
    
    return features

## Preparing Data

In [6]:
positive_labels = [1] * len(positive_reviews)
negative_labels = [0] * len(negative_reviews)

reviews = positive_reviews + negative_reviews
labels = positive_labels + negative_labels

## Normal Features

In [7]:
features = extract_features(reviews)
features[:5]

[[0, 0, 0, 0, 0, 1.6094379124341003],
 [0, 0, 0, 0, 0, 1.9459101490553132],
 [0, 0, 0, 0, 0, 1.791759469228055],
 [0, 0, 0, 0, 1, 1.9459101490553132],
 [0, 0, 0, 1, 0, 2.4849066497880004]]

In [13]:
X = np.array(features)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

(40000, 6)


## Train and evaluate models

In [9]:
def train_and_evaluate_model(model, model_name, trainset, testset):
    model.fit(trainset, y_train)
    y_pred = model.predict(testset)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name}: {accuracy:.4f}")

In [10]:
models = [
    (LogisticRegression(), "Logistic Regression"),
    (RandomForestClassifier(), "Random Forest"),
    (SVC(), "Support Vector Machine")
]

In [14]:
for model, model_name in models:
    train_and_evaluate_model(model, model_name, X_train, X_test)

Accuracy of Logistic Regression: 0.5919
Accuracy of Random Forest: 0.5946
Accuracy of Support Vector Machine: 0.5904


#### Naive Bayes 

In [16]:
from sklearn.naive_bayes import MultinomialNB
train_and_evaluate_model(MultinomialNB(), "Naive Bayes classifier", X_train, X_test)

Accuracy of Naive Bayes classifier: 0.5715


## More Features

### Apply TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

In [18]:
print(tfidf_matrix.shape)

(40000, 9182)


In [None]:
new_features = extract_features(reviews, tfidf_matrix.toarray())

In [20]:
X2 = np.array(new_features)
print(X2.shape)
X2_train, X2_test, y_train, y_test = train_test_split(X2, labels, test_size=0.2, random_state=42)

(40000, 9188)


In [22]:
train_and_evaluate_model(LogisticRegression(), "Logistic Regression", X2_train, X2_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of Logistic Regression: 0.9296
