Wczytanie danych:

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

import os
# Show which files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Load the training set and keep only the review-text column as a Series.
reviews = pd.read_csv('../input/train.csv', sep=',', header=0)["Reviews"]

Recenzje tokenizujemy oraz dodajemy do countera:

In [None]:
from nltk import word_tokenize

# Corpus-wide token frequencies, accumulated over every review.
words = Counter()

# Iterate over the values themselves rather than looking them up with
# `reviews.iat[label]`: index labels are only valid positions while the
# Series still has its default 0..n-1 index.
for review in reviews:
    # Non-string entries (e.g. missing values) cannot be tokenized; skip them.
    if isinstance(review, str):
        words.update(word_tokenize(review))

# Notebook cell output: the 15 most frequent tokens.
words.most_common(15)

Usuwamy stop words oraz znaki interpunkcyjne:

In [None]:
from nltk.corpus import stopwords
import string

# Tokens to discard: English stop words plus single punctuation characters.
# The set gets its own name so the imported `stopwords` corpus object is not
# overwritten, which keeps this cell safe to run more than once.
stop_tokens = set(stopwords.words('english')).union(string.punctuation)

# Walk over a snapshot of the distinct tokens (entries are deleted while
# looping). Counter.elements() would yield each token once per occurrence,
# repeating the same check for every occurrence of a token.
for word in list(words):
    if word.lower() in stop_tokens:
        del words[word]

# The empty string is reserved for "empty review"; deleting a key that is
# absent from a Counter does not raise, so this is safe unconditionally.
del words['']
    

Przygotowujemy funkcję, która zamieni dane wejściowe na csr_matrix:

In [None]:
def prepare_data(documents, features, train=False, *, text_col=4, label_col=5):
    """Turn review documents into a sparse token-count matrix.

    Each row of ``documents`` becomes one matrix row; each entry of
    ``features`` becomes one matrix column holding how often that token
    occurs in the document. A document that is not a string, or that
    tokenizes to nothing, is represented by the single token ``""``.

    Args:
        documents: pandas DataFrame with the review text in column position
            ``text_col`` and, when ``train`` is true, the rating in column
            position ``label_col``.
        features: mapping from token to column index. Tokens that are not in
            the mapping are ignored.
        train: when true, also extract the labels.
        text_col: positional index of the text column.
        label_col: positional index of the rating column.

    Returns:
        A ``(matrix, labels)`` tuple. ``matrix`` is a ``csr_matrix`` of shape
        ``(len(documents), len(features))``. ``labels`` holds ``rating / 5``
        for every row when ``train`` is true and is an empty list otherwise.

    Raises:
        ValueError: if ``train`` is true and a rating cannot be converted
            with ``int()`` (for example a missing value).
    """
    # Pull the columns out once and address rows by position. Using index
    # labels as positions (and as matrix row numbers) is only correct for a
    # default 0..n-1 index and breaks on filtered or re-indexed frames.
    texts = documents.iloc[:, text_col].tolist()

    labels = []
    if train:
        # Ratings are divided by 5 so that the targets are at most 1.
        labels = [int(label) / 5 for label in documents.iloc[:, label_col]]

    row = []
    col = []
    data = []

    for position, text in enumerate(texts):
        tokens = word_tokenize(text) if isinstance(text, str) else []
        if not tokens:
            # Empty reviews are encoded with the dedicated "" token.
            tokens = [""]

        for token, count in Counter(tokens).items():
            if token not in features:
                continue
            row.append(position)
            col.append(features[token])
            data.append(count)

    matrix = csr_matrix((data, (row, col)), shape=(len(texts), len(features)))
    return matrix, labels

Wczytujemy dane treningowe i testowe oraz wybieramy tokeny, na których będziemy budować model. Do tokenów dodajemy wartość pustą, która będzie odpowiadała pustej recenzji.
Zmienna local określa, czy chcemy wygenerować rozwiązanie (False), czy podzielić zbiór treningowy na treningowy oraz testowy (True).

In [None]:
# A token must occur more than this many times in the corpus to be a feature.
min_word_count = 100

train_data = pd.read_csv("../input/train.csv", header=0, sep=",")
test_data = pd.read_csv("../input/test.csv", header=0, sep=",")

# Vocabulary: the frequent tokens, followed by "" which stands for an empty
# review.
common_words = [
    token for token, count in words.most_common() if count > min_word_count
]
common_words.append("")
print("Tokens: " + str(len(common_words)))

# Token -> column index, numbered in vocabulary order.
feature_dict = dict(zip(common_words, range(len(common_words))))

print("Training classifier...")
X_train, y_train = prepare_data(train_data, feature_dict, True)

# False: build features for the real test set (submission run).
# True: hold out part of the training data for local evaluation.
local = False

if not local:
    X_test, empty_list = prepare_data(test_data, feature_dict)
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.33, random_state=42
    )

Budujemy model, uczymy i dokonujemy predykcji. W zależności od wartości zmiennej "local" zapisujemy rozwiązanie do pliku lub obliczamy RMSE.

In [None]:
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import SGD

# Small fully connected regressor over the token-count features, with a
# single output unit.
model = Sequential()
model.add(Dense(128, input_shape=(len(common_words),), activation="tanh"))
model.add(Dense(64, activation="tanh"))
model.add(Dense(1, activation="tanh"))

model.compile(loss="mse", optimizer='rmsprop', metrics=["mse"])

H = model.fit(X_train, y_train, epochs=20, batch_size=32)

predictions = model.predict(X_test, batch_size=32)

# Undo the 1/5 target scaling applied in prepare_data and clamp the result
# to the valid rating range [1, 5].
predicted = [min(max(prediction[0] * 5, 1), 5) for prediction in predictions]

if local:
    from math import sqrt
    from sklearn.metrics import mean_squared_error

    # Rescale into a new list instead of multiplying y_test in place, so
    # running this cell again does not scale the labels a second time.
    y_true = [label * 5 for label in y_test]
    rmse = sqrt(mean_squared_error(y_true, predicted))
    print("RMSE: ", rmse)
else:
    import csv

    # The csv module expects files opened with newline=''; without it extra
    # blank lines can appear between rows on platforms that translate "\n".
    with open('submission.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Id", "Rating"])
        for i, rating in enumerate(predicted):
            # Column 0 of the test set is written as the submission Id.
            writer.writerow([test_data.iat[i, 0], rating])