# SVM

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

# Load the dataset
df = pd.read_csv('reviews.csv')

# Define a function to clean the text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Join the words back into a cleaned sentence
    cleaned_text = ' '.join(words)
    return cleaned_text

# Apply the clean_text function to the 'Review' column
df['Review'] = df['Review'].apply(clean_text)

# Display the cleaned DataFrame
print(df.head())


   Id                                             Review  Label
0   0                                   good interesting      5
1   1  class helpful currently im still learning clas...      5
2   2  likeprof tas helpful discussion among students...      5
3   3  easy follow includes lot basic important techn...      5
4   4      really nice teacheri could got point eazliy v      4


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['Label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the max_features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train an SVM classifier
svm_classifier = SVC(kernel='linear')  # You can choose different kernels like 'linear', 'rbf', etc.
svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)


Accuracy: 0.7689216968790881
Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.36      0.43       493
           2       0.41      0.11      0.17       484
           3       0.29      0.09      0.14       933
           4       0.49      0.16      0.24      3613
           5       0.80      0.98      0.88     15881

    accuracy                           0.77     21404
   macro avg       0.50      0.34      0.37     21404
weighted avg       0.71      0.77      0.71     21404



In [3]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
accuracy_score(y_pred,y_test)*100

76.8921696879088

# CNN

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

# Load data
data = pd.read_csv("reviews.csv")

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        words = word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        cleaned_text = ' '.join(words)
        return cleaned_text
    else:
        return ''

data['Tidy_Reviews'] = data['Review'].apply(preprocess_text)

def convert_text_to_numerical(text):
    unique_words = len(set(' '.join(data['Tidy_Reviews']).split()))
    num_words = min(7000, unique_words)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    sequence_lengths = [len(seq) for seq in sequences]
    average_length = int(sum(sequence_lengths) / len(sequence_lengths))
    maxlen = min(140, average_length * 2)
    pad_seqs = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return pad_seqs, tokenizer, maxlen, num_words  # Include num_words in the return values

data = data.reset_index()
numeric_reviews, tokenizer, maxlen, num_words = convert_text_to_numerical(data['Tidy_Reviews'])
data.insert(len(data.columns)-1, "numeric_reviews", numeric_reviews.tolist())

label_encoder = LabelEncoder()
data.insert(len(data.columns), "encoded_labels", label_encoder.fit_transform(data['Label']))

# Splitting the dataset
inputs = data[['numeric_reviews']]
outputs = data[['encoded_labels']]

X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, shuffle=True, random_state=42)

X_train = np.asarray(X_train['numeric_reviews'].tolist(), dtype=np.int32)
X_test = np.asarray(X_test['numeric_reviews'].tolist(), dtype=np.int32)
y_train = np.asarray(y_train['encoded_labels'].tolist(), dtype=np.int32)
y_test = np.asarray(y_test['encoded_labels'].tolist(), dtype=np.int32)

# Building the CNN model
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=128, input_length=maxlen, trainable=True))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='softmax'))  # Assuming you have 6 classes

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.1)])

# Evaluation
predicted_labels = model.predict(X_test)
predicted_classes = np.argmax(predicted_labels, axis=1)

cnn_accuracy = accuracy_score(y_test, predicted_classes)
print("Accuracy of CNN is", cnn_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Accuracy of CNN is 0.7512614464586058


# LSTM

In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

# Load data
data = pd.read_csv("reviews.csv")

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        words = word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        cleaned_text = ' '.join(words)
        return cleaned_text
    else:
        return ''

data['Tidy_Reviews'] = data['Review'].apply(preprocess_text)

def convert_text_to_numerical(text):
    unique_words = len(set(' '.join(data['Tidy_Reviews']).split()))
    num_words = min(7000, unique_words)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    sequence_lengths = [len(seq) for seq in sequences]
    average_length = int(sum(sequence_lengths) / len(sequence_lengths))
    maxlen = min(140, average_length * 2)
    pad_seqs = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return pad_seqs, tokenizer, maxlen, num_words  # Include num_words in the return values

data = data.reset_index()
numeric_reviews, tokenizer, maxlen, num_words = convert_text_to_numerical(data['Tidy_Reviews'])
data.insert(len(data.columns)-1, "numeric_reviews", numeric_reviews.tolist())

label_encoder = LabelEncoder()
data.insert(len(data.columns), "encoded_labels", label_encoder.fit_transform(data['Label']))

# Splitting the dataset
inputs = data[['numeric_reviews']]
outputs = data[['encoded_labels']]

X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, shuffle=True, random_state=42)

X_train = np.asarray(X_train['numeric_reviews'].tolist(), dtype=np.int32)
X_test = np.asarray(X_test['numeric_reviews'].tolist(), dtype=np.int32)
y_train = np.asarray(y_train['encoded_labels'].tolist(), dtype=np.int32)
y_test = np.asarray(y_test['encoded_labels'].tolist(), dtype=np.int32)

# Create the model
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=128, input_length=maxlen, trainable=True))
model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(6, activation='softmax'))  # Assuming you have 6 classes

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001)])

# Evaluation
predicted_labels = model.predict(X_test)
predicted_classes = np.argmax(predicted_labels, axis=1)

lstm_accuracy = accuracy_score(y_test, predicted_classes)
print("Accuracy of LSTM is", lstm_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Accuracy of LSTM is 0.7737338815174734
