LinearSVC

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics

train = pd.read_csv('train.csv')
train.rating = np.where(train.rating > 6, 1, 0) 
train['review'] = train['benefits_review'] + ' ' + train['side_effects_review'] + ' ' + train['comments_review']

train_X = train.loc[:, ['review']]
train_y = train.rating
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.3, stratify=train_y)
X_train_docs = [doc for doc in X_train.review]
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2), 
                                    stop_words='english')),
            ('cls', LinearSVC())
])
pipeline.fit(X_train_docs, y_train)
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)
predicted = pipeline.predict([doc for doc in X_test.review])
validation_accuracy = metrics.accuracy_score(y_test, predicted)
f1_score = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", f1_score)

Training accuracy: 0.7632698672853631
Validation accuracy: 0.7462365591397849
F1 score: 0.8345021037868163


In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics

train = pd.read_csv('train.csv')
train.rating = np.where(train.rating > 6, 1, 0) 
train['review'] = train['benefits_review'] + ' ' + train['side_effects_review'] + ' ' + train['comments_review']

# Convert text columns to lowercase
train['review'] = train['review'].apply(lambda x: x.lower())

train_X = train.loc[:, ['review']]
train_y = train.rating
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.3, stratify=train_y)
X_train_docs = [doc for doc in X_train.review]
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2), 
                                    stop_words='english')),
            ('cls', LinearSVC())
])
pipeline.fit(X_train_docs, y_train)
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)
predicted = pipeline.predict([doc for doc in X_test.review])
validation_accuracy = metrics.accuracy_score(y_test, predicted)
f1_score = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", f1_score)

Training accuracy: 0.7701737955109035
Validation accuracy: 0.7430107526881721
F1 score: 0.8322807017543861


In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
import re

train = pd.read_csv('train.csv')
train.rating = np.where(train.rating > 6, 1, 0) 
train['review'] = train['benefits_review'] + ' ' + train['side_effects_review'] + ' ' + train['comments_review']

# Convert text columns to lowercase
train['review'] = train['review'].apply(lambda x: x.lower())

# remove numbers from text
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

train['review'] = train['review'].apply(remove_numbers)

train_X = train.loc[:, ['review']]
train_y = train.rating
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.3, stratify=train_y)
X_train_docs = [doc for doc in X_train.review]
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2), 
                                    stop_words='english')),
            ('cls', LinearSVC())
])
pipeline.fit(X_train_docs, y_train)
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)
predicted = pipeline.predict([doc for doc in X_test.review])
validation_accuracy = metrics.accuracy_score(y_test, predicted)
f1_score = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", f1_score)

Training accuracy: 0.7572695054331052
Validation accuracy: 0.7408602150537634
F1 score: 0.8301620859760395


In [30]:
# test
test = df = pd.read_csv("test.csv")

# preprossing
test.rating = np.where(test.rating > 6, 1, 0) 
test['review'] = test['benefits_review'] + ' ' + test['side_effects_review'] + ' ' + test['comments_review']

test['review'] = test['review'].apply(lambda x: x.lower())

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

test['review'] = test['review'].apply(remove_numbers)

predicted_test = pipeline.predict([doc for doc in test.review])
print("test accuracy:", accuracy_score(test.rating, predicted_test))

test accuracy: 0.7712355212355212


Logistic Regression

In [35]:
# logistic regression
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Load dataset (replace with your actual dataset)
train = pd.read_csv('train.csv')

train.rating = np.where(train.rating > 6, 1, 0) 
train['review'] = train['benefits_review'] + ' ' + train['side_effects_review'] + ' ' + train['comments_review']

train['review'] = train['review'].apply(lambda x: x.lower())

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

train['review'] = train['review'].apply(remove_numbers)

# Feature extraction using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(train['review'])
y = train['rating']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a Logistic Regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 score: {f1:.4f}")


Accuracy: 0.6935
F1 score: 0.8174


Random Forest

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Load dataset (replace with your actual dataset)
train = pd.read_csv('train.csv')

train.rating = np.where(train.rating > 6, 1, 0) 
train['review'] = train['benefits_review'] + ' ' + train['side_effects_review'] + ' ' + train['comments_review']

train['review'] = train['review'].apply(lambda x: x.lower())

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

train['review'] = train['review'].apply(remove_numbers)

# Feature extraction using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(train['review'])
y = train['rating']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a Random Forest classifier
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 score: {f1:.4f}")


Accuracy: 0.6957
F1 score: 0.8180
