In [1]:
import math
import os
import re
import nltk
import numpy as np

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk import *
from nltk.stem.porter import *
from nltk import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import string
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Path to the folder
data_path = '/content/drive/MyDrive/data4'
data = os.listdir(data_path)

positive_reviews = os.listdir('/content/drive/MyDrive/data4/pos')
negative_reviews = os.listdir('/content/drive/MyDrive/data4/neg')

print(positive_reviews[:5])
print(len(positive_reviews))
print(negative_reviews[:5])
print(len(negative_reviews))

['4731_8.txt', '3264_8.txt', '7935_8.txt', '7447_8.txt', '9327_8.txt']
2000
['9485_3.txt', '6435_3.txt', '9565_3.txt', '5093_3.txt', '4392_3.txt']
2000


In [4]:
def load_data(path, label):
    data = []

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

            # Extracting star rating using regular expression
            match = re.search(r'_(\d+)\.txt$', filename)
            star_rating = int(match.group(1)) if match else 0

            data.append((content, label, star_rating))

    return data

# Load positive reviews
positive_reviews = load_data('/content/drive/MyDrive/data4/pos', 'positive')
print(positive_reviews[:20])
print(len(positive_reviews))

# Load negative reviews
negative_reviews = load_data('/content/drive/MyDrive/data4/neg', 'negative')
print(len(negative_reviews))


[("This is definitely one of Jet's best efforts. Few actors are able to play the stoic as Jet Li can. The action is rapid-fire, and special-effects boosted for intensity purposes. As a result, it may take Americans a little off-guard. A little suspension of disbelief goes a long way in a Jet Li film. I feel that it is an excellent introduction to Jet's work and look forward to further masterpieces (especially Fist of Legend) making it into the US market. A nice mixture of gunplay and physical conflict will satisfy most action flick enthusiasts.", 'positive', 8), ('What a loss the passing of director Emile Ardolino was! He could take a light script and, with the right casting and editing, put a twinkle in it and make it shine like a star. This particular star may not be the brightest in the sky as great romances go, but it is definitely one that keeps you tuned in to the end. You really want to know how things are going to work out.<br /><br />The script is perfect for Cybill Shepherd, 

In [5]:
from sklearn.model_selection import train_test_split

TotalDataset = positive_reviews + negative_reviews

data = [review[0] for review in TotalDataset]
labels = [label[1] for label in TotalDataset]

#Spillting the data into 80-10-10
X_train, X_temp, y_train, y_temp = train_test_split(data, labels, test_size=0.2, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [6]:
#Combination 1
def process_one(data_list):
  stoplist = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  tokenized_list = []
  for content in data_list:
    #list comprehension
    word_list = [lemmatizer.lemmatize(word) for word in word_tokenize(content.lower())
                 if not word in stoplist]
    tokenized_list.append(word_list)
  return tokenized_list

training_feature_selection_one = process_one(X_train)
dev_feature_selection_one = process_one(X_dev)
test_feature_selection_one = process_one(X_test)

In [7]:
#Combination two
def process_two(data_list):
    stoplist = set(stopwords.words('english'))
    st = PorterStemmer()
    tokenized_list = []
    for content in data_list:
        word_list = [st.stem(re.sub(r'[^a-zA-Z0-9]', '', word)) for word in word_tokenize(content.lower()) if not word in stoplist and not re.match(r'[^a-zA-Z0-9]+', word)]
        tokenized_list.append(word_list)

    return tokenized_list

training_feature_selection_two = process_two(X_train)
dev_feature_selection_two = process_two(X_dev)
test_feature_selection_two = process_two(X_test)

In [None]:
#Combination three
def process_three(data_list):
    tokenized_list = []
    st = PorterStemmer()
    for content in data_list:
        word_list = [st.stem(re.sub(r'[^a-zA-Z0-9]', '', word)) for word in word_tokenize(content.lower()) if not re.search(r'[^a-zA-Z0-9]+', word)]
        tokenized_list.append(word_list)
    return tokenized_list

training_feature_selection_three = process_three(X_train)
dev_feature_selection_three = process_three(X_dev)
test_feature_selection_three = process_three(X_test)

In [9]:
def calculate_ngrams(data_list):
    ngrams_list = []
    for content in data_list:
        n_grams = list(ngrams(content,2))
        grams_string = [' '.join(gram) for gram in n_grams]
        ngrams_list.append(grams_string)
    return ngrams_list

#Process 1
train_ngrams_one = calculate_ngrams(training_feature_selection_one)
dev_ngrams_one = calculate_ngrams(dev_feature_selection_one)
test_ngrams_one = calculate_ngrams(test_feature_selection_one)

#Process 2
train_ngrams_two = calculate_ngrams(training_feature_selection_two)
dev_ngrams_two = calculate_ngrams(dev_feature_selection_two)
test_ngrams_two = calculate_ngrams(test_feature_selection_two)

#Process 3
train_ngrams_three = calculate_ngrams(training_feature_selection_three)
dev_ngrams_three = calculate_ngrams(dev_feature_selection_three)
test_ngrams_three = calculate_ngrams(test_feature_selection_three)

In [10]:
# Trivial Normalisation
def trivial_normalisation(ngrams_list):
    trivial_list = []
    for doc in ngrams_list:
        frequencies = Counter(doc)
        tf_dict = {gram: frequencies[gram]/ len(doc) for gram in doc}
        trivial_list.append(tf_dict)
    return trivial_list

train_values_one = trivial_normalisation(train_ngrams_one)
dev_values_one = trivial_normalisation(dev_ngrams_one)
test_values_one = trivial_normalisation(test_ngrams_one)

train_values_two = trivial_normalisation(train_ngrams_one)
dev_values_two = trivial_normalisation(dev_ngrams_two)
test_values_three = trivial_normalisation(test_ngrams_three)

train_values_one = trivial_normalisation(train_ngrams_one)
dev_values_two = trivial_normalisation(dev_ngrams_two)
test_values_three = trivial_normalisation(test_ngrams_three)


In [11]:
def calculate_all_tfidfs(ngrams_list):
    def calculate_tfidf(gram, doc, doc_frequencies, num_docs):
        frequencies = Counter(doc)
        tf = frequencies[gram] / len(doc)
        idf = np.log(num_docs / (doc_frequencies[gram] + 1))
        return tf * idf

    num_docs = len(ngrams_list)
    doc_frequencies = Counter(gram for doc in ngrams_list for gram in set(doc))

    allTfIDfs = []

    for doc in ngrams_list:
        tfIdfs = {gram: calculate_tfidf(gram, doc, doc_frequencies, num_docs) for gram in doc}
        allTfIDfs.append(tfIdfs)

    return allTfIDfs

#Process one
train_tfidf_values_one = calculate_all_tfidfs(train_ngrams_one)
dev_tfidf_values_one = calculate_all_tfidfs(dev_ngrams_one)
test_tfidf_values_one = calculate_all_tfidfs(test_ngrams_one)

#Process two
train_tfidf_values_two = calculate_all_tfidfs(train_ngrams_two)
dev_tfidf_values_two = calculate_all_tfidfs(dev_ngrams_two)
test_tfidf_values_two = calculate_all_tfidfs(test_ngrams_two)

#Process three
train_tfidf_values_three = calculate_all_tfidfs(train_ngrams_three)
dev_tfidf_values_three = calculate_all_tfidfs(dev_ngrams_three)
test_tfidf_values_three = calculate_all_tfidfs(test_ngrams_three)

In [12]:
def extract_unique_features(tfidf_values_list):
    unique_features = set()

    for tfidf_values in tfidf_values_list:
        unique_features.update(tfidf_values.keys())

    return list(unique_features)

# Extract unique features from the TF-IDF values of the training set
unique_features_one = extract_unique_features(train_tfidf_values_one)
unique_features_two = extract_unique_features(train_tfidf_values_two)
unique_features_three = extract_unique_features(train_tfidf_values_three)

In [None]:
import numpy as np

def create_tfidf_matrix(tfidf_values_list, unique_features):
    num_docs = len(tfidf_values_list)
    num_features = len(unique_features)
    matrix = np.zeros((num_docs, num_features))

    for i, tfidf_values in enumerate(tfidf_values_list):
        matrix[i] = np.array([tfidf_values.get(feature, 0) for feature in unique_features])

    return matrix

#Process one
train_tfidf_matrix_one = create_tfidf_matrix(train_tfidf_values_one, unique_features_one)
dev_tfidf_matrix_one = create_tfidf_matrix(dev_tfidf_values_one, unique_features_one)
test_tfidf_matrix_one = create_tfidf_matrix(test_tfidf_values_one, unique_features_one)

#Process two
train_tfidf_matrix_two = create_tfidf_matrix(train_tfidf_values_two, unique_features_two)
dev_tfidf_matrix_two = create_tfidf_matrix(dev_tfidf_values_two, unique_features_two)
test_tfidf_matrix_two = create_tfidf_matrix(test_tfidf_values_two, unique_features_two)

#Process three
train_tfidf_matrix_three = create_tfidf_matrix(train_tfidf_values_three, unique_features_three)
dev_tfidf_matrix_three = create_tfidf_matrix(dev_tfidf_values_three, unique_features_three)
test_tfidf_matrix_three = create_tfidf_matrix(test_tfidf_values_three, unique_features_three)

In [None]:
print(train_tfidf_matrix_one.shape)
print(dev_tfidf_matrix_one.shape)
print(test_tfidf_matrix_one.shape)

print(train_tfidf_matrix_two.shape)
print(dev_tfidf_matrix_two.shape)
print(test_tfidf_matrix_two.shape)

print(train_tfidf_matrix_three.shape)
print(dev_tfidf_matrix_three.shape)
print(test_tfidf_matrix_three.shape)

In [None]:
#Process one
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(train_tfidf_matrix_one, y_train)
#Evaluating on the development matrix
y_pred_one = nb_classifier.predict(dev_tfidf_matrix_one)
accuracy_one = accuracy_score(y_dev, y_pred_one)
print(f"Combination one Accuracy: {accuracy_one}")
# ClassificationReport_one = classification_report(y_dev, y_pred_one)
# print(f"\nCombination one Classification Report\n: {ClassificationReport_one}")

#Evaluate on the test set
test_y_pred_one = nb_classifier.predict(test_tfidf_matrix_one)
test_accuracy_one = accuracy_score(y_test, test_y_pred_one)
print(f"Combination one Test Accuracy: {test_accuracy_one}")
# ClassificationReport_one = classification_report(y_test, test_y_pred_one)
# print(f"\nCombination one Classification Report\n: {ClassificationReport_one}")

#Process two
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb_classifier = MultinomialNB()

nb_classifier.fit(train_tfidf_matrix_two, y_train)
y_pred_two = nb_classifier.predict(dev_tfidf_matrix_two)

accuracy_two = accuracy_score(y_dev, y_pred_two)
print(f"Combination two Accuracy: {accuracy_two}")
# ClassificationReport_two = classification_report(y_dev, y_pred_two)
# print(f"\nCombination two Classification Report\n: {ClassificationReport_two}")

#Evaluate on the test set
test_y_pred_two = nb_classifier.predict(test_tfidf_matrix_two)
test_accuracy_two = accuracy_score(y_test, test_y_pred_two)
print(f"Combination two Test Accuracy: {test_accuracy_two}")
# ClassificationReport_two = classification_report(y_test, test_y_pred_two)
# print(f"\nCombination two Classification Report\n: {ClassificationReport_two}")

#Process three
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb_classifier = MultinomialNB()

nb_classifier.fit(train_tfidf_matrix_three, y_train)
y_pred_three = nb_classifier.predict(dev_tfidf_matrix_three)

accuracy_three = accuracy_score(y_dev, y_pred_three)
ClassificationReport_three = classification_report(y_dev, y_pred_three)
print(f"Combination three Accuracy: {accuracy_three}")
# ClassificationReport_three = classification_report(y_dev, y_pred_three)
# print(f"\nCombination three Classification Report\n: {ClassificationReport_three}")

#Evaluate on the test set
test_y_pred_three = nb_classifier.predict(test_tfidf_matrix_three)
test_accuracy_three = accuracy_score(y_test, test_y_pred_three)
print(f"Combination three Test Accuracy: {test_accuracy_two}")
# ClassificationReport_two = classification_report(y_test, test_y_pred_three)
# print(f"\nCombination three Classification Report\n: {ClassificationReport_three}")



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict

def calculate_prior(y_train):
    class_counts = defaultdict(int)
    for label in y_train:
        class_counts[label] += 1
    total_samples = len(y_train)
    class_probabilities = {label: count / total_samples for label, count in class_counts.items()}
    return class_probabilities

def calculate_likelihood(tfidf_matrix, y_train):
    class_counts = defaultdict(int)
    feature_counts = defaultdict(lambda: defaultdict(float))

    for i, label in enumerate(y_train):
        class_counts[label] += 1
        for j, value in enumerate(tfidf_matrix[i]):
            feature_counts[label][j] += value

    likelihoods = defaultdict(dict)
    for label in class_counts:
        total_samples_in_class = class_counts[label]
        likelihoods[label] = {feature: count / total_samples_in_class for feature, count in feature_counts[label].items()}

    return likelihoods

def predict(tfidf_matrix, prior_probabilities, likelihoods):
    predictions = []

    # Precompute normalized likelihoods for each label
    normalized_likelihoods = {label: {feature: count / sum(likelihoods[label].values()) for feature, count in likelihoods[label].items()} for label in prior_probabilities}

    for sample in tfidf_matrix:
        max_prob = float('-inf')
        predicted_label = None

        for label in prior_probabilities:
            log_prob = sum([sample[feature] * normalized_likelihoods[label].get(feature, 0) for feature in range(len(sample))])
            log_prob += prior_probabilities[label]

            if log_prob > max_prob:
                max_prob = log_prob
                predicted_label = label

        predictions.append(predicted_label)

    return predictions

def calculate_accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred)


prior_probabilities = calculate_prior(y_train)

likelihoods_one = calculate_likelihood(train_tfidf_matrix_one, y_train)
predictions_one = predict(dev_tfidf_matrix_one, prior_probabilities, likelihoods_one)
accuracy_one = calculate_accuracy(y_dev, predictions_one)
print("Accuracy for Combination one:", accuracy_one)
ClassificationReport_one = classification_report(y_dev, predictions_one)
print("\nClassification Report for Combination one\n:", ClassificationReport_one)

likelihoods_two = calculate_likelihood(train_tfidf_matrix_two, y_train)
predictions_two = predict(dev_tfidf_matrix_two, prior_probabilities, likelihoods_two)
accuracy_two = calculate_accuracy(y_dev, predictions_two)
print("Accuracy for Combination two:", accuracy_two)
ClassificationReport_two = classification_report(y_dev, predictions_two)
print("\nClassification Report for Combination two\n:", ClassificationReport_two)

likelihoods_three = calculate_likelihood(train_tfidf_matrix_three, y_train)
predictions_three = predict(dev_tfidf_matrix_three, prior_probabilities, likelihoods_three)
accuracy_three = calculate_accuracy(y_dev, predictions_three)
print("Accuracy for Combination three:", accuracy_three)

ClassificationReport_one = classification_report(y_dev, predictions_one)
print("\nClassification Report for Combination three\n:", ClassificationReport_one)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Combination one
model = LogisticRegression()
model.fit(train_tfidf_matrix_one, y_train)

# Evaluate on the development set
dev_predictions_one = model.predict(dev_tfidf_matrix_one)
logistic_regresion_accuracy_one = accuracy_score(y_dev, dev_predictions_one)
print(f"Logistic Regression Accuracy for Combination one: {logistic_regresion_accuracy_one}")

ClassificationReport_one = classification_report(y_dev, dev_predictions_one)
print(f"\nCombination one Classification Report\n: {ClassificationReport_one}")

#Evaluate on Test set
test_predictions_one = model.predict(test_tfidf_matrix_one)
logistic_regresion_test_accuracy_one = accuracy_score(y_test, test_predictions_one)
print(f"Logistic Regression Accuracy for Combination one: {logistic_regresion_test_accuracy_one}")

ClassificationReport_test_one = classification_report(y_test, test_predictions_one)
print(f"\nCombination one Classification Report\n: {ClassificationReport_test_one}")

#Combination two
model = LogisticRegression()
model.fit(train_tfidf_matrix_two, y_train)

# Evaluate on the development set
dev_predictions_two = model.predict(dev_tfidf_matrix_two)
logistic_regresion_accuracy_two = accuracy_score(y_dev, dev_predictions_two)
print(f"Logistic Regression Accuracy for Combination two: {logistic_regresion_accuracy_two}")

ClassificationReport_two = classification_report(y_dev, dev_predictions_two)
print(f"\nCombination one Classification Report\n: {ClassificationReport_two}")

#Evaluate on Test set
test_predictions_two = model.predict(test_tfidf_matrix_two)
logistic_regresion_test_accuracy_two = accuracy_score(y_test, test_predictions_two)
print(f"Logistic Regression Accuracy for Combination one: {logistic_regresion_test_accuracy_two}")

ClassificationReport_test_two = classification_report(y_test, test_predictions_two)
print(f"\nCombination one Classification Report\n: {ClassificationReport_test_two}")

#Combination three
model = LogisticRegression()
model.fit(train_tfidf_matrix_three, y_train)

# Evaluate on the development set
dev_predictions_three = model.predict(dev_tfidf_matrix_three)
logistic_regresion_accuracy_three = accuracy_score(y_dev, dev_predictions_three)
print(f"Logistic Regression Accuracy for Combination three: {logistic_regresion_accuracy_three}")

ClassificationReport_three = classification_report(y_dev, dev_predictions_three)
print(f"\nCombination one Classification Report\n: {ClassificationReport_three}")

#Evaluate on Test set
test_predictions_three = model.predict(test_tfidf_matrix_three)
logistic_regresion_test_accuracy_three = accuracy_score(y_test, test_predictions_three)
print(f"Logistic Regression Accuracy for Combination one: {logistic_regresion_test_accuracy_three}")

ClassificationReport_test_three = classification_report(y_test, test_predictions_three)
print(f"\nCombination one Classification Report\n: {ClassificationReport_test_three}")

In [None]:
from sklearn.svm import SVC

#Combination one
model = SVC()
model.fit(train_tfidf_matrix_one, y_train)
# Evaluate on the development set
dev_predictions_one = model.predict(dev_tfidf_matrix_one)
SVM_accuracy_one = accuracy_score(y_dev, dev_predictions_one)
print(f"SVM Accuracy for Combination one: {SVM_accuracy_one}")

ClassificationReport_one = classification_report(y_dev, dev_predictions_one)
print(f"\nCombination one Classification Report\n: {ClassificationReport_one}")

#Evaluate on Test set
test_predictions_one = model.predict(test_tfidf_matrix_one)
logistic_regresion_test_accuracy_one = accuracy_score(y_test, test_predictions_one)
print(f"SVM Accuracy for Combination one: {logistic_regresion_test_accuracy_one}")

ClassificationReport_test_one = classification_report(y_test, test_predictions_one)
print(f"\nCombination one Classification Report\n: {ClassificationReport_test_one}")

#Combination two
model = SVC()
model.fit(train_tfidf_matrix_two, y_train)
dev_predictions_two = model.predict(dev_tfidf_matrix_two)
SVM_accuracy_two = accuracy_score(y_dev, dev_predictions_two)
print(f"SVM Accuracy for Combination two: {SVM_accuracy_two}")

ClassificationReport_two = classification_report(y_dev, dev_predictions_two)
print(f"\nCombination two Classification Report\n: {ClassificationReport_two}")

#Evaluate on Test set
test_predictions_two = model.predict(test_tfidf_matrix_two)
logistic_regresion_test_accuracy_two = accuracy_score(y_test, test_predictions_two)
print(f"SVM Regression Test Accuracy for Combination two: {logistic_regresion_test_accuracy_two}")

ClassificationReport_test_two = classification_report(y_test, test_predictions_two)
print(f"\nCombination two Classification Report\n: {ClassificationReport_test_two}")

#Combination three
model = SVC()
model.fit(train_tfidf_matrix_three, y_train)
dev_predictions_three = model.predict(dev_tfidf_matrix_three)
SVM_accuracy_three = accuracy_score(y_dev, dev_predictions_three)
print(f"SVM Accuracy for Combination three: {SVM_accuracy_three}")

ClassificationReport_three = classification_report(y_dev, dev_predictions_three)
print(f"\nCombination three Classification Report\n: {ClassificationReport_three}")

#Evaluate on Test set
test_predictions_three = model.predict(test_tfidf_matrix_three)
logistic_regresion_test_accuracy_three = accuracy_score(y_test, test_predictions_three)
print(f"Logistic Regression Test Accuracy for Combination three: {logistic_regresion_test_accuracy_three}")

ClassificationReport_test_three = classification_report(y_test, test_predictions_three)
print(f"\nCombination three Classification Report\n: {ClassificationReport_test_three}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import random
from sklearn.svm import SVC

def logistic_regression_hyperparameter(train_matrix, dev_matrix, test_matrix, y_train, y_dev, y_test):
    # Define hyperparameter grid
    hyperparameters = {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'max_iter': [100, 200, 300]  # Add a range of max_iter values
    }

    best_accuracy = 0
    best_hyperparameters = None

    # Try 5 combinations
    for _ in range(5):
        # Randomly select hyperparameters
        current_hyperparameters = {
            'C': random.choice(hyperparameters['C']),
            'penalty': random.choice(hyperparameters['penalty']),
            'solver': random.choice(hyperparameters['solver']),
            'max_iter': random.choice(hyperparameters['max_iter'])
        }

        # Create and train logistic regression model
        model = LogisticRegression(**current_hyperparameters)
        model.fit(train_matrix, y_train)

        # Evaluate on dev matrix
        y_dev_pred = model.predict(dev_matrix)
        dev_accuracy = accuracy_score(y_dev, y_dev_pred)

        # Check if current combination is the best
        if dev_accuracy > best_accuracy:
            best_accuracy = dev_accuracy
            best_hyperparameters = current_hyperparameters

    # Apply best hyperparameters on test set
    best_model = LogisticRegression(**best_hyperparameters)
    best_model.fit(train_matrix, y_train)
    y_test_pred = best_model.predict(test_matrix)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    ClassificationReport = classification_report(y_test, y_test_pred)

    print("Dev Set Accuracy (Best):", best_accuracy)
    print("Test Set Accuracy:", test_accuracy)
    print("\nClassification report\n", ClassificationReport)

    return best_hyperparameters, best_accuracy, test_accuracy

logistic_regression_hyperparameter(train_tfidf_matrix_two, dev_tfidf_matrix_two, test_tfidf_matrix_two, y_train, y_dev, y_test)


In [None]:
def svm_hyperparameter(train_matrix, dev_matrix, test_matrix, y_train, y_dev, y_test):
    # Define hyperp__arameter grid
    hyperparameters = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }

    best_accuracy = 0
    best_hyperparameters = None

    # Try 5 combinations
    for i in range(5):
        # Randomly select hyperparameters
        current_hyperparameters = {
            'C': random.choice(hyperparameters['C']),
            'kernel': random.choice(hyperparameters['kernel']),
            'gamma': random.choice(hyperparameters['gamma'])
        }

        # Create and train SVM model
        model = SVC(**current_hyperparameters)
        model.fit(train_matrix, y_train)

        # Evaluate on dev matrix
        y_dev_pred = model.predict(dev_matrix)
        dev_accuracy = accuracy_score(y_dev, y_dev_pred)

        # Check if current combination is the best
        if dev_accuracy > best_accuracy:
            best_accuracy = dev_accuracy
            best_hyperparameters = current_hyperparameters

    # Apply best hyperparameters on test set
    best_model = SVC(**best_hyperparameters)
    best_model.fit(train_matrix, y_train)
    y_test_pred = best_model.predict(test_matrix)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    ClassificationReport = classification_report(y_test, y_test_pred)

    print("\nBest Hyperparameters:", best_hyperparameters)
    print("Dev Set Accuracy (Best):", best_accuracy)
    print("Test Set Accuracy:", test_accuracy)
    print("\nClassification report\n", ClassificationReport)

    return best_hyperparameters, best_accuracy, test_accuracy

svm_hyperparameter(train_tfidf_matrix_two, dev_tfidf_matrix_two, test_tfidf_matrix_two, y_train, y_dev, y_test)

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch]>=0.20.1


In [None]:
#BERT
import os
from google.colab import drive
drive.mount('/content/drive')

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/bert',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/bertlogs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=X_train,               # training dataset
    eval_dataset=X_dev                   # evaluation dataset
)

trainer.train()
model.save()
model.eval()