In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Baseline experiment: TF-IDF (uni+bigrams) + LinearSVC on the raw review text.

# Load the training data and binarise the target: ratings above 6 become the
# positive class (1), everything else the negative class (0).
train = pd.read_csv('train.csv')
train['rating'] = np.where(train['rating'] > 6, 1, 0)

# Merge the three free-text fields into a single document per row. Missing
# fields are treated as empty text; without this a single NaN would turn the
# whole concatenated review into NaN.
train['review'] = (
    train['benefits_review'].fillna('') + ' '
    + train['side_effects_review'].fillna('') + ' '
    + train['comments_review'].fillna('')
)

train_X = train.loc[:, ['review']]
train_y = train['rating']

# Hold out 30% for validation, stratified on the label. The fixed random_state
# makes the split (and therefore the printed scores) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.3, stratify=train_y, random_state=42
)
X_train_docs = X_train['review'].tolist()

pipeline = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('cls', LinearSVC(random_state=42)),
])
pipeline.fit(X_train_docs, y_train)

# NOTE: despite the label, this is the mean 5-fold cross-validation accuracy
# computed on the training split, not accuracy on the data the model was fit on.
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)

# Evaluate on the held-out split. The F1 result gets its own name so the
# imported f1_score function is not overwritten by a float.
predicted = pipeline.predict(X_test['review'].tolist())
validation_accuracy = metrics.accuracy_score(y_test, predicted)
validation_f1 = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", validation_f1)

Training accuracy: 0.7632698672853631
Validation accuracy: 0.7462365591397849
F1 score: 0.8345021037868163


In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Experiment 2: same model as the baseline, with the review text lowercased first.

# Load the training data and binarise the target: ratings above 6 become the
# positive class (1), everything else the negative class (0).
train = pd.read_csv('train.csv')
train['rating'] = np.where(train['rating'] > 6, 1, 0)

# Merge the three free-text fields into a single document per row. Missing
# fields are treated as empty text; without this a single NaN would turn the
# whole concatenated review into NaN and break the lowercasing step below.
train['review'] = (
    train['benefits_review'].fillna('') + ' '
    + train['side_effects_review'].fillna('') + ' '
    + train['comments_review'].fillna('')
)

# Convert the combined text to lowercase (vectorised string accessor instead
# of a per-row lambda).
# NOTE(review): TfidfVectorizer appears to lowercase its input by default; if
# so this step does not change the extracted features - confirm.
train['review'] = train['review'].str.lower()

train_X = train.loc[:, ['review']]
train_y = train['rating']

# Hold out 30% for validation, stratified on the label. The fixed random_state
# makes the split (and therefore the printed scores) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.3, stratify=train_y, random_state=42
)
X_train_docs = X_train['review'].tolist()

pipeline = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('cls', LinearSVC(random_state=42)),
])
pipeline.fit(X_train_docs, y_train)

# NOTE: despite the label, this is the mean 5-fold cross-validation accuracy
# computed on the training split, not accuracy on the data the model was fit on.
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)

# Evaluate on the held-out split. The F1 result gets its own name so the
# imported f1_score function is not overwritten by a float.
predicted = pipeline.predict(X_test['review'].tolist())
validation_accuracy = metrics.accuracy_score(y_test, predicted)
validation_f1 = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", validation_f1)

Training accuracy: 0.7701737955109035
Validation accuracy: 0.7430107526881721
F1 score: 0.8322807017543861


In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
import re

# Experiment 3 (part 1): load the training data and normalise the text.

# Binarise the target: ratings above 6 become the positive class (1),
# everything else the negative class (0).
train = pd.read_csv('train.csv')
train['rating'] = np.where(train['rating'] > 6, 1, 0)

# Merge the three free-text fields into a single document per row. Missing
# fields are treated as empty text; without this a single NaN would turn the
# whole concatenated review into NaN and break the text cleaning that follows.
train['review'] = (
    train['benefits_review'].fillna('') + ' '
    + train['side_effects_review'].fillna('') + ' '
    + train['comments_review'].fillna('')
)

# Convert the combined text to lowercase (vectorised string accessor instead
# of a per-row lambda).
train['review'] = train['review'].str.lower()
# remove numbers from text
def remove_numbers(text):
    """Return ``text`` with every run of digit characters deleted.

    Nothing is inserted in place of the digits, so the characters on either
    side of a removed run end up adjacent.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Experiment 3 (part 2): strip digits, then train and evaluate the model.
train['review'] = train['review'].apply(remove_numbers)

train_X = train.loc[:, ['review']]
train_y = train['rating']

# Hold out 30% for validation, stratified on the label. The fixed random_state
# makes the split (and therefore the printed scores) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.3, stratify=train_y, random_state=42
)
X_train_docs = X_train['review'].tolist()

# This fitted pipeline is the one the later test-set cell predicts with.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('cls', LinearSVC(random_state=42)),
])
pipeline.fit(X_train_docs, y_train)

# NOTE: despite the label, this is the mean 5-fold cross-validation accuracy
# computed on the training split, not accuracy on the data the model was fit on.
training_accuracy = cross_val_score(pipeline, X_train_docs, y_train, cv=5).mean()
print("Training accuracy:", training_accuracy)

# Evaluate on the held-out split. The F1 result gets its own name so the
# imported f1_score function is not overwritten by a float.
predicted = pipeline.predict(X_test['review'].tolist())
validation_accuracy = metrics.accuracy_score(y_test, predicted)
validation_f1 = f1_score(y_test, predicted)
print("Validation accuracy:", validation_accuracy)
print("F1 score:", validation_f1)

Training accuracy: 0.7604963761560647
Validation accuracy: 0.7741935483870968
F1 score: 0.8506401137980085


In [28]:
# Test-set evaluation (part 1): load test.csv and repeat the training-side
# preprocessing so the fitted pipeline sees text in the same form.
# `df` is kept as a second name bound to the same DataFrame object as `test`.
test = df = pd.read_csv("test.csv")

# Preprocessing
# Binarise the target with the same threshold as training (rating > 6 -> 1).
test['rating'] = np.where(test['rating'] > 6, 1, 0)

# Merge the three free-text fields into a single document per row. Missing
# fields are treated as empty text; without this a single NaN would turn the
# whole concatenated review into NaN and break the text cleaning that follows.
test['review'] = (
    test['benefits_review'].fillna('') + ' '
    + test['side_effects_review'].fillna('') + ' '
    + test['comments_review'].fillna('')
)

# Lowercase the combined text (vectorised string accessor instead of a
# per-row lambda).
test['review'] = test['review'].str.lower()

def remove_numbers(text):
    """Delete every run of digits from ``text`` and return the remainder.

    Duplicates the helper of the same name defined in the training cell
    earlier in this notebook. Relies on ``re`` already being imported.
    """
    # Splitting on digit runs and re-joining with nothing in between drops
    # the digits while leaving all other characters in their original order.
    return ''.join(re.split(r'\d+', text))

# Test-set evaluation (part 2): strip digit runs from the test reviews, as was
# done for the training text.
test['review'] = test['review'].map(remove_numbers)

# `pipeline` is not defined in this cell; it is the fitted model left in the
# kernel by the preceding training cell. Score it against the binarised ratings.
test_docs = test['review'].tolist()
predicted_test = pipeline.predict(test_docs)
test_accuracy = accuracy_score(test.rating, predicted_test)
print("test accuracy", test_accuracy)

test accuracy 0.7712355212355212
