### Extract data from files

In [1]:
# imports and downloads
import os
import nltk
import math
import random
from nltk.corpus import stopwords
import numpy as np
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def extract_review_score(folder_name):
    documents = {}
    scores = {}
    id = 0

    for sentiment in ('pos', 'neg'):
        path = os.path.join(folder_name, sentiment)
        
        # sort filenames in order of ID
        sorted_filenames = sorted(os.listdir(path), key=lambda x: int(x.split('_')[0])) 

        # loop in order of file names
        for file_name in sorted_filenames:

            # extract the score from filename 
            # ['{ID}','{score}.txt}'] -> ['{score}', 'txt'] -> '{score}'  
            score = file_name.split('_')[1].split('.')[0]

            # read content of the file
            with open(os.path.join(path, file_name), 'r', encoding='utf-8') as file:
                line = file.read()
                documents[id] = line
                scores[id] = 1 if int(score) >= 7 else 0
                id += 1
    
    return documents, scores

# extract data from 'train' and 'test' folders
documents, doc_scores = extract_review_score('data')
doc_reviews = list(documents.values())

# print first review and its corresponding score
print(documents.get(0))
print(doc_reviews[0])
print(doc_scores.get(0))

The production quality, cast, premise, authentic New England (Waterbury, CT?) locale and lush John Williams score should have resulted in a 3-4 star collectors item. Unfortunately, all we got was a passable 2 star "decent" flick, mostly memorable for what it tried to do.........bring an art house style film mainstream. The small town locale and story of ordinary people is a genre to itself, and if well done, will satisfy most grownups. Jane Fonda was unable to hide her braininess enough to make her character believable. I wondered why she wasn't doing a post doctorate at Yale instead of working in a dead end factory job in Waterbury. Robert DiNiro's character was just a bit too contrived. An illiterate, nice guy loser who turns out to actually be, with a little help from Jane's character, a 1990 version of Henry Ford or Thomas Edison.<br /><br />This genre has been more successfully handled by "Nobody's Fool" in the mid 90s and this year's (2003) "About Schmidt." I wish that the main s

In [3]:
# both should be 4000 
print(len(documents))
print(len(doc_reviews))
print(len(doc_scores))

4000
4000
4000


### Feature generation


In [4]:
import string
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# do feature generation after the data split

stop_words = set(stopwords.words('english'))

# Features 1
# - lowercase
# - remove stopwords and punctuation
# - use lemmatization

def generate_features_1(doc_reviews, n):
    review_ngrams = {}
    lemmatizer = WordNetLemmatizer()
    for id, review in enumerate(doc_reviews):
        word_list = nltk.word_tokenize(review.lower()) 
        word_list = [lemmatizer.lemmatize(word) for word in word_list if not word in stop_words and not word in string.punctuation]

        # remove "''", 'br','``','--','...',"'s" using .replace()
        word_list = [word.replace("''", "").replace('br', "").replace('``', "").replace('--', "").replace('...', "").replace("'s", "") for word in word_list]
        # remove empty strings
        word_list = [word for word in word_list if word != ""]

        n_grams = list(nltk.ngrams(word_list, n))
        review_ngrams[id] = n_grams # add to dictionary with corresponding review ID (same numbering as the text files)

    return review_ngrams

# Features 2
# - remove stopwords and punctuation
# - use stemming
def generate_features_2(doc_reviews, n):
    review_ngrams = {}
    st = LancasterStemmer() 
    for id, review in enumerate(doc_reviews):
        word_list = nltk.word_tokenize(review) # tokenize
        word_list = [st.stem(word) for word in word_list if not word in stop_words and not word in string.punctuation] 

        word_list = [word.replace("''", "").replace('br', "").replace('``', "").replace('--', "").replace('...', "").replace("'s", "") for word in word_list]

        n_grams = list(nltk.ngrams(word_list, n))
        review_ngrams[id] = n_grams 

    return review_ngrams

# Features 3
# - only lemmatisation used
def generate_features_3(doc_reviews, n):
    review_ngrams = {}
    lemmatizer = WordNetLemmatizer()
    for id, review in enumerate(doc_reviews):
        word_list = nltk.word_tokenize(review) # tokenize
        word_list = [lemmatizer.lemmatize(word) for word in word_list] 

        word_list = [word.replace("''", "").replace('br', "").replace('``', "").replace('--', "").replace('...', "").replace("'s", "") for word in word_list]

        n_grams = list(nltk.ngrams(word_list, n))
        review_ngrams[id] = n_grams 

    return review_ngrams


n_value = 1 # change for other n-grams
doc_features = generate_features_1(doc_reviews, n_value) # we will be testing with features 1, 2 and 3. i will use one at a time for simplicity
# doc_features = generate_features_2(doc_reviews, n_value)
# doc_features = generate_features_3(doc_reviews, n_value)
print(len(doc_features))
print(doc_reviews[0])
print(doc_features.get(0))

4000
The production quality, cast, premise, authentic New England (Waterbury, CT?) locale and lush John Williams score should have resulted in a 3-4 star collectors item. Unfortunately, all we got was a passable 2 star "decent" flick, mostly memorable for what it tried to do.........bring an art house style film mainstream. The small town locale and story of ordinary people is a genre to itself, and if well done, will satisfy most grownups. Jane Fonda was unable to hide her braininess enough to make her character believable. I wondered why she wasn't doing a post doctorate at Yale instead of working in a dead end factory job in Waterbury. Robert DiNiro's character was just a bit too contrived. An illiterate, nice guy loser who turns out to actually be, with a little help from Jane's character, a 1990 version of Henry Ford or Thomas Edison.<br /><br />This genre has been more successfully handled by "Nobody's Fool" in the mid 90s and this year's (2003) "About Schmidt." I wish that the m

In [5]:
all_terms = []
for id, reviews in doc_features.items():
    for terms in reviews:
        all_terms.append(terms)

all_terms = list(set(all_terms)) # remove duplicates
print(len(all_terms))

40609


### Data splits

In [6]:
random.seed(3)

# split data into training, validation and testings
num_train_docs = int(len(documents) * 0.7 * 0.5) # ensure equal number of positive and negative reviews for training
num_validation_docs = int(len(documents) * 0.2 * 0.5)
num_test_docs = int(len(documents) * 0.1 * 0.5)

# shuffle the IDs
pos_index = list(range(len(doc_features) // 2))
neg_index = list(range(len(doc_features) // 2, len(doc_features)))

# randomize indices
random.shuffle(pos_index)
random.shuffle(neg_index)

# split the IDs
train_doc_ids = pos_index[:num_train_docs] + neg_index[:num_train_docs]
validation_doc_ids = pos_index[num_train_docs: num_train_docs + num_validation_docs] + neg_index[num_train_docs: num_train_docs + num_validation_docs]
test_doc_ids = pos_index[num_train_docs + num_validation_docs:] + neg_index[num_train_docs + num_validation_docs:]

# split the reviews
train_reviews = {id: doc_features[id] for id in train_doc_ids}
validation_reviews = {id: doc_features[id] for id in validation_doc_ids}
test_reviews = {id: doc_features[id] for id in test_doc_ids}

# split the scores
train_scores = {id: doc_scores[id] for id in train_doc_ids}
validation_scores = {id: doc_scores[id] for id in validation_doc_ids}
test_scores = {id: doc_scores[id] for id in test_doc_ids}

print(f"ratio: {sum(train_scores.values())/len(train_scores)}") 
print(f"ratio: {sum(validation_scores.values())/len(validation_scores)}")
print(f"ratio: {sum(test_scores.values())/len(test_scores)}")

print(list(train_reviews.keys())[:3])
print(documents.get(1823))
print(train_reviews.get(1823))
print(list(validation_scores.items())[:10])

print(list(train_reviews.values())[:5])

ratio: 0.5
ratio: 0.5
ratio: 0.5
[1006, 619, 20]
I believe this is the most powerful film HBO Pictures has made to date. This film should have been released in theaters for the public to view on the big screen. It is available on video so make sure you look for it and check it out. Chris Gerolmo did a great job with the direction and the screenplay. The performances from Stephen Rea, Donald Sutherland and Jeffery DeMunn are flawless. A masterpiece of the genre.
[('believe',), ('powerful',), ('film',), ('hbo',), ('picture',), ('made',), ('date',), ('film',), ('released',), ('theater',), ('public',), ('view',), ('big',), ('screen',), ('available',), ('video',), ('make',), ('sure',), ('look',), ('check',), ('chris',), ('gerolmo',), ('great',), ('job',), ('direction',), ('screenplay',), ('performance',), ('stephen',), ('rea',), ('donald',), ('sutherland',), ('jeffery',), ('demunn',), ('flawless',), ('masterpiece',), ('genre',)]
[(1671, 1), (1897, 1), (605, 1), (1101, 1), (1906, 1), (1570, 

### Feature selection

In [7]:
def calc_normalized_tf(reviews):
    tf = {} 

    for id, review in reviews.items(): # go through each review
        tf[id] = {}
        for word in review:
            if word not in tf[id]:
                tf[id][word] = 1 # if word not in dictionary, initialize to 1. 
            else:
                tf[id][word] += 1 # if word in dictionary, increment by 1

    return tf

term_freq = calc_normalized_tf(train_reviews)
print(term_freq.get(1823))

{('believe',): 1, ('powerful',): 1, ('film',): 2, ('hbo',): 1, ('picture',): 1, ('made',): 1, ('date',): 1, ('released',): 1, ('theater',): 1, ('public',): 1, ('view',): 1, ('big',): 1, ('screen',): 1, ('available',): 1, ('video',): 1, ('make',): 1, ('sure',): 1, ('look',): 1, ('check',): 1, ('chris',): 1, ('gerolmo',): 1, ('great',): 1, ('job',): 1, ('direction',): 1, ('screenplay',): 1, ('performance',): 1, ('stephen',): 1, ('rea',): 1, ('donald',): 1, ('sutherland',): 1, ('jeffery',): 1, ('demunn',): 1, ('flawless',): 1, ('masterpiece',): 1, ('genre',): 1}


In [8]:
#PPMI

#construct term co-occurence matrix
# def construct_term_to_term_matrix(reviews):
#     tot_mat = {}

#     for id, review in reviews.items():
#         words = set(review) # get unique words in the review
#         for word in words:
#             if word not in tot_mat:
#                 tot_mat[word] = {}
#             for other_word in words:
#                 if other_word != word:
#                     if other_word not in tot_mat[word]:
#                         tot_mat[word][other_word] = 1 # if the word pair does not exist, add it to the matrix
#                     else:
#                         tot_mat[word][other_word] += 1 # if the word pair exists, increment the count by 1

#     return tot_mat

# def calc_ppmi(tot_mat):
#     ppmi = {}

#     total_count = sum(sum(counts.values()) for counts in tot_mat.values())
#     word_count = {word: sum(counts.values()) for word, counts in tot_mat.items()} 

#     for word, counts in tot_mat.items():
#         ppmi[word] = {}
        
#         for other_word, count in counts.items(): # loop through each word pair
#             # calculate PPMI using the formula
#             p_x = word_count[word] / total_count 
#             p_y = word_count[other_word] / total_count
#             p_xy = count / total_count
#             ppmi[word][other_word] = max(math.log(p_xy / (p_x * p_y), 2), 0)
    
#     return ppmi

# tot_mat = construct_term_to_term_matrix(train_reviews)
# train_ppmi = calc_ppmi(tot_mat)

In [9]:
# TF-IDF

# Document frequency (how many documents contain a term)
def calc_df(reviews):
    df = {}
    for id, review in reviews.items():
        for word in set(review):
            if word not in df:
                df[word] = 1
            else:
                df[word] += 1

    return df

doc_freq = calc_df(train_reviews)
print(doc_freq.get(('film',)))

# Inverse document frequency (how important a term is)
def calc_idf(df, num_docs):
    idf = {}
    for word, term_df in df.items():
        idf[word] = math.log(num_docs / term_df, 10)

    return idf

inverse_doc_freq = calc_idf(doc_freq, len(train_reviews))
print(inverse_doc_freq.get(('film',)))

# TF-IDF (term frequency - inverse document frequency)
def calc_tfidf(tf, idf):
    tfidf = {}
    for id, review in tf.items():
        tfidf[id] = {}
        for word, freq in review.items():
            tfidf[id][word] = freq * idf[word]

    return tfidf

train_tfidf = calc_tfidf(term_freq, inverse_doc_freq)

# print first review and its corresponding TF-IDF
print(doc_reviews[1823])
print(train_tfidf.get(1823))


1691
0.21901442374447744
I believe this is the most powerful film HBO Pictures has made to date. This film should have been released in theaters for the public to view on the big screen. It is available on video so make sure you look for it and check it out. Chris Gerolmo did a great job with the direction and the screenplay. The performances from Stephen Rea, Donald Sutherland and Jeffery DeMunn are flawless. A masterpiece of the genre.
{('believe',): 1.0272022828524614, ('powerful',): 1.6276140958003502, ('film',): 0.4380288474889549, ('hbo',): 2.3679767852945943, ('picture',): 1.1396619934290062, ('made',): 0.6152882570617174, ('date',): 1.6912831756697277, ('released',): 1.4603862970759742, ('theater',): 1.4740301777425204, ('public',): 1.6837300377792819, ('view',): 1.4259687322722812, ('big',): 0.9420080530223132, ('screen',): 1.0724096853321152, ('available',): 1.9030899869919433, ('video',): 1.3298867356864548, ('make',): 0.4475925431162368, ('sure',): 1.045757490560675, ('look

### Naïve Bayes

In [10]:
class NaiveBayesClassifier:
    def __init__(self):
        self.positive = {} # P(feature | positive)
        self.negative = {} # P(feature | negative)
        self.positive_prior = 0 # P(positive)
        self.negative_prior = 0 # P(negative)  

    def train_tfidf(self, train_tfidf, train_scores):
        # calculate P("positive") and P("negative"), P(content | "positive"), P(content | "negative")
        pos_id = []
        neg_id = []

        for id in train_tfidf.keys(): # separate positive and negative IDs
            if train_scores.get(id) == 1:
                pos_id.append(id)
            else:
                neg_id.append(id)     
        
        self.positive_prior = len(pos_id) / len(train_tfidf.keys()) # P(positive)
        self.negative_prior = len(neg_id) / len(train_tfidf.keys()) # P(negative)

        positive_total_tfidf = 0 
        negative_total_tfidf = 0

        for id, sub_dict in train_tfidf.items():
            for term, tfidf_value in sub_dict.items():
                if id in pos_id:
                    positive_total_tfidf += tfidf_value
                    if term not in self.positive:
                        self.positive[term] = tfidf_value
                    else:
                        self.positive[term] += tfidf_value
                else:
                    negative_total_tfidf += tfidf_value
                    if term not in self.negative:
                        self.negative[term] = tfidf_value
                    else:
                        self.negative[term] += tfidf_value
                

        # divide by total tf-idf value for each class
        for term in self.positive.keys():
            self.positive[term] /= positive_total_tfidf
        for term in self.negative.keys():
            self.negative[term] /= negative_total_tfidf

    
    def train_ppmi(self, train_ppmi, train_scores, train_reviews):
        pos_id = []
        neg_id = []

        for id in train_reviews.keys():
            if train_scores.get(id) == 1:
                pos_id.append(id)
            else:
                neg_id.append(id)     
        
        self.positive_prior = len(pos_id) / len(train_reviews.keys())
        self.negative_prior = len(neg_id) / len(train_reviews.keys())

        # initialize all values to 0
        self.positive = {word: 0 for word in train_ppmi} 
        self.negative = {word: 0 for word in train_ppmi}

        for id in pos_id: # for each positive review 
            for word in train_reviews[id]:  
                if word in self.positive:
                    # sum up the PPMI values for each existing word pair in the review
                    self.positive[word] += sum(train_ppmi[word].get(other_word, 0) for other_word in train_reviews[id] if other_word in train_ppmi[word])

        for id in neg_id:
            for word in train_reviews[id]:
                if word in self.negative:
                    self.negative[word] += sum(train_ppmi[word].get(other_word, 0) for other_word in train_reviews[id] if other_word in train_ppmi[word])
        
        total_positive = sum(self.positive.values())
        total_negative = sum(self.negative.values())

        # divide by total PPMI value for each class
        for word in self.positive.keys():
            self.positive[word] /= total_positive
        for word in self.negative.keys():
            self.negative[word] /= total_negative

        # set all 0 values to 1e-10 to avoid log(0)
        for word in self.positive.keys():
            if self.positive[word] <= 0:
                self.positive[word] = 1e-10
        
        for word in self.negative.keys():
            if self.negative[word] <= 0:
                self.negative[word] = 1e-10
        
                
    def predict(self, reviews):
        classification = {}

        for id, review in reviews.items():
            p_positive = math.log(self.positive_prior)
            p_negative = math.log(self.negative_prior)

            for term in review:
                if term in self.positive:
                    p_positive += math.log(self.positive[term])
                else:  # Laplace smoothing for positive
                    p_positive += math.log(1 / (sum(self.positive.values()) + len(self.positive)))

                if term in self.negative:
                    p_negative += math.log(self.negative[term])
                else:  # Laplace smoothing for negative
                    p_negative += math.log(1 / (sum(self.negative.values()) + len(self.negative)))

            classification[id] = 1 if p_positive > p_negative else 0

        return classification
    

    def evaluate(self, classification, scores):
        correct = 0

        for id, score in scores.items():
            if classification[id] == score:
                correct += 1

        return correct / len(scores.keys())


classifier_tfidf = NaiveBayesClassifier()
classifier_tfidf.train_tfidf(train_tfidf, train_scores)
classification_tfidf = classifier_tfidf.predict(validation_reviews)
accuracy_tfidf = classifier_tfidf.evaluate(classification_tfidf, validation_scores)
print(f'accuracy for validation set: {accuracy_tfidf}')

# predict test set
classification_tfidf = classifier_tfidf.predict(test_reviews)
accuracy_tfidf = classifier_tfidf.evaluate(classification_tfidf, test_scores)
print(f'accuracy for test set: {accuracy_tfidf}')

# classifier_ppmi = NaiveBayesClassifier()
# classifier_ppmi.train_ppmi(train_ppmi, train_scores, train_reviews)
# classification_ppmi = classifier_ppmi.predict(validation_reviews)
# accuracy_ppmi = classifier_ppmi.evaluate(classification_ppmi, validation_scores)
# print(f'accuracy for validation set: {accuracy_ppmi}')

accuracy for validation set: 0.775
accuracy for test set: 0.7825


### sci-kit naive bayes model

In [11]:
# create sparse matrix (column of all terms, row of all reviews) 

# all_terms contain all the terms in the entire dataset, we want a sparse matrix so we need to initialize each row's column with all the terms

# review is a dictionary of {id : {term : tfidf}}
def create_sparse_matrix(all_terms, reviews):
    sparse_matrix = {}

    for id, review in reviews.items():
        sparse_matrix[id] = {}
        for term in all_terms:
            sparse_matrix[id][term] = 0

    # fill in the existing tfidf values
    for id, review in reviews.items():
        for term, tfidf in review.items():
            sparse_matrix[id][term] = tfidf

    return sparse_matrix


# get TF-IDF values for validation and test sets
test_doc_freq = calc_df(test_reviews)
test_inverse_doc_freq = calc_idf(test_doc_freq, len(test_reviews))
test_tfidf = calc_tfidf(calc_normalized_tf(test_reviews), test_inverse_doc_freq)

validation_doc_freq = calc_df(validation_reviews)
validation_inverse_doc_freq = calc_idf(validation_doc_freq, len(validation_reviews))
validation_tfidf = calc_tfidf(calc_normalized_tf(validation_reviews), validation_inverse_doc_freq)

# create sparse matrix
train_sparse_matrix = create_sparse_matrix(all_terms, train_tfidf)
validation_sparse_matrix = create_sparse_matrix(all_terms, validation_tfidf)
test_sparse_matrix = create_sparse_matrix(all_terms, test_tfidf)

# convert to numpy array
train_sparse_matrix = np.array([list(review.values()) for review in train_sparse_matrix.values()])
validation_sparse_matrix = np.array([list(review.values()) for review in validation_sparse_matrix.values()])
test_sparse_matrix = np.array([list(review.values()) for review in test_sparse_matrix.values()])


In [12]:
from sklearn.naive_bayes import MultinomialNB

train_scores = list(train_scores.values())
validation_scores = list(validation_scores.values())
test_scores = list(test_scores.values())

# train sklearn model
multinomial_nb = MultinomialNB()
multinomial_nb.fit(train_sparse_matrix, train_scores)


# prediction for validation set
sklearn_classification = multinomial_nb.predict(validation_sparse_matrix)
sklearn_accuracy = multinomial_nb.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for MultinomialNB on validation set: {sklearn_accuracy}')


# prediction for test set
sklearn_classification = multinomial_nb.predict(test_sparse_matrix)
sklearn_accuracy = multinomial_nb.score(test_sparse_matrix, test_scores)
print(f'accuracy for MultinomialNB on test set: {sklearn_accuracy}') 


accuracy for MultinomialNB on validation set: 0.79625
accuracy for MultinomialNB on test set: 0.8


In [13]:
# logistic regression
from sklearn.linear_model import LogisticRegression

# logistic_regression = LogisticRegression(penalty='none', fit_intercept=True, tol=1e-8)
logistic_regression = LogisticRegression()
logistic_regression.fit(train_sparse_matrix, train_scores)

logistic_regression_2 = LogisticRegression(penalty='none', fit_intercept=True, tol=1e-8)
logistic_regression_2.fit(train_sparse_matrix, train_scores)

logistic_regression_3 = LogisticRegression(penalty='none', fit_intercept=False, tol=1e-8)
logistic_regression_3.fit(train_sparse_matrix, train_scores)

logistic_regression_4 = LogisticRegression(penalty='l2', fit_intercept=False, tol=1e-8)
logistic_regression_4.fit(train_sparse_matrix, train_scores)

logistic_regression_5 = LogisticRegression(penalty='l2', fit_intercept=True, tol=1e-8)
logistic_regression_5.fit(train_sparse_matrix, train_scores)


# prediction for validtion set
# logistic_regression_classification = logistic_regression.predict(validation_sparse_matrix)
# logistic_regression_accuracy = logistic_regression.score(validation_sparse_matrix, validation_scores)
# print(f'accuracy for logistic regression on validation set: {logistic_regression_accuracy}')

logistic_regression_classification = logistic_regression_2.predict(validation_sparse_matrix)
logistic_regression_accuracy = logistic_regression_2.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for logistic regression 2 on validation set: {logistic_regression_accuracy}')

# logistic_regression_classification = logistic_regression_3.predict(validation_sparse_matrix)
# logistic_regression_accuracy = logistic_regression_3.score(validation_sparse_matrix, validation_scores)
# print(f'accuracy for logistic regression 3 on validation set: {logistic_regression_accuracy}')

# logistic_regression_classification = logistic_regression_4.predict(validation_sparse_matrix)
# logistic_regression_accuracy = logistic_regression_4.score(validation_sparse_matrix, validation_scores)
# print(f'accuracy for logistic regression 4 on validation set: {logistic_regression_accuracy}')

# logistic_regression_classification = logistic_regression_5.predict(validation_sparse_matrix)
# logistic_regression_accuracy = logistic_regression_5.score(validation_sparse_matrix, validation_scores)
# print(f'accuracy for logistic regression 5 on validation set: {logistic_regression_accuracy}')



# prediction for test set
# logistic_regression_classification = logistic_regression.predict(test_sparse_matrix)
# logistic_regression_accuracy = logistic_regression.score(test_sparse_matrix, test_scores)
# print(f'accuracy for logistic regression on test set: {logistic_regression_accuracy}') 

logistic_regression_classification = logistic_regression_2.predict(test_sparse_matrix)
logistic_regression_accuracy = logistic_regression_2.score(test_sparse_matrix, test_scores)
print(f'accuracy for logistic regression 2 on test set: {logistic_regression_accuracy}')

accuracy for logistic regression 2 on validation set: 0.845
accuracy for logistic regression 2 on test set: 0.875


In [14]:
# svm
from sklearn import svm

# linear svc
linear_svc = svm.LinearSVC(max_iter=10000)
linear_svc.fit(train_sparse_matrix, train_scores)

linear_svc_2 = svm.LinearSVC(max_iter=10000, penalty='l2', fit_intercept=True, dual=True)
linear_svc_2.fit(train_sparse_matrix, train_scores)

linear_svc_3 = svm.LinearSVC(max_iter=10000, penalty='l1', dual=False)
linear_svc_3.fit(train_sparse_matrix, train_scores)

linear_svc_4 = svm.LinearSVC(max_iter=10000, dual=True, fit_intercept=False)
linear_svc_4.fit(train_sparse_matrix, train_scores)

linear_svc_5 = svm.LinearSVC(max_iter=10000, penalty='l2', dual=False)
linear_svc_5.fit(train_sparse_matrix, train_scores)

# prediction for validation set
linear_svc_classification = linear_svc.predict(validation_sparse_matrix)
linear_svc_accuracy = linear_svc.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for linear svc on validation set: {linear_svc_accuracy}')

linear_svc_classification = linear_svc_2.predict(validation_sparse_matrix)
linear_svc_accuracy = linear_svc_2.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for linear svc 2 on validation set: {linear_svc_accuracy}')

linear_svc_classification = linear_svc_3.predict(validation_sparse_matrix)
linear_svc_accuracy = linear_svc_3.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for linear svc 3 on validation set: {linear_svc_accuracy}')

linear_svc_classification = linear_svc_4.predict(validation_sparse_matrix)
linear_svc_accuracy = linear_svc_4.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for linear svc 4 on validation set: {linear_svc_accuracy}')

linear_svc_classification = linear_svc_5.predict(validation_sparse_matrix)
linear_svc_accuracy = linear_svc_5.score(validation_sparse_matrix, validation_scores)
print(f'accuracy for linear svc 5 on validation set: {linear_svc_accuracy}')

# prediction for test set
# linear_svc_classification = linear_svc.predict(test_sparse_matrix)
# linear_svc_accuracy = linear_svc.score(test_sparse_matrix, test_scores)
# print(f'accuracy for linear svc on test set: {linear_svc_accuracy}') 

accuracy for linear svc on validation set: 0.835
accuracy for linear svc 2 on validation set: 0.835
accuracy for linear svc 3 on validation set: 0.82
accuracy for linear svc 4 on validation set: 0.83625
accuracy for linear svc 5 on validation set: 0.83125
