In [1]:
import pandas as pd

In [2]:
# Source of the review data (CSV served behind a short link)
DATA_URL = "https://bit.ly/dune2reviews"
df = pd.read_csv(DATA_URL)

In [3]:
# Rows lacking the review text or the rating cannot be used for training: imputing 0
# there would invent a rating of 0 (labelled "Ok" by the >= 9 rule applied afterwards)
# and would put the number 0 where review text is expected. Drop those rows first,
# then fill any remaining gaps in the other columns with 0 as before.
df = df.dropna(subset=['Review Content', 'User Rating']).fillna(0)

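**Task:** Create a text regression model that predicts the user rating of each review. Test the model on a user-given review.

**Note:** Rather than regressing on the numeric rating, this notebook treats the task as binary classification: ratings of 9 or higher are labelled "Excellent" and all others "Ok".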

In [4]:
from sklearn.pipeline import Pipeline

# Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Transformer
from sklearn.feature_extraction.text import TfidfTransformer #Term frequency * inverse document frequency

# Classifier
from sklearn.neural_network import MLPClassifier

In [5]:
# Text-classification pipeline: raw review text -> token counts -> TF-IDF weights -> neural network.
pipe = Pipeline([
    # Bag-of-words counts with English stop words removed
    ('vec', CountVectorizer(stop_words='english')),
    # Re-weight the counts by term frequency * inverse document frequency
    ('tfidf', TfidfTransformer()),
    # Two hidden layers (100 and 50 units) with tanh activation. random_state pins the
    # weight initialisation and batch shuffling so the reported scores can be
    # reproduced on a fresh kernel (same seed as the train/test split).
    ('nn', MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh', random_state=123)),
])

In [6]:
def _rating_label(score):
    """Return "Excellent" for a score of 9 or more, otherwise "Ok"."""
    if score >= 9:
        return "Excellent"
    return "Ok"


# Collapse the numeric user rating into the two-class target used by the model
df['Custom Rating'] = df['User Rating'].apply(_rating_label)

In [7]:
# Features are the raw review texts; the target is the binned rating label
X, y = df['Review Content'], df['Custom Rating']

In [8]:
from sklearn.model_selection import train_test_split
# Keep 80% of the reviews for training and hold out the rest for evaluation;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
)

In [9]:
# Fit every pipeline stage (vectorizer, TF-IDF, network) on the training reviews only
pipe.fit(X=X_train, y=y_train)

In [10]:
# Accuracy of the fitted pipeline on the held-out reviews
pipe.score(X=X_test, y=y_test)

0.8084112149532711

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
# Predicted labels for the held-out reviews, reused by the metrics below
y_pred = pipe.predict(X=X_test)

In [13]:
# Per-class precision, recall and F1; printed so the text table keeps its layout
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

   Excellent       0.81      0.93      0.87       146
          Ok       0.79      0.54      0.64        68

    accuracy                           0.81       214
   macro avg       0.80      0.74      0.76       214
weighted avg       0.81      0.81      0.80       214



In [14]:
# Rows are the true labels and columns the predicted labels, both in sorted
# label order ("Excellent", then "Ok")
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[136,  10],
       [ 31,  37]], dtype=int64)

In [18]:
# Try the model on a hand-written review; predict expects an iterable of texts
sample_review = "It was very long and there was too much desert."
pipe.predict([sample_review])

array(['Ok'], dtype='<U9')