## Import functionality

In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
from time import time
from os.path import join
import warnings
from sklearn.metrics import accuracy_score
import pickle
from typing import Any
import torch
from torch import nn, tensor
from torch.utils.data import Dataset, DataLoader
import gc
device = torch.device('cuda')

## Load data

In [48]:
SOURCE_DATA_FOLDER = ['..','datasets']

df_sentiment = pd.read_csv(join(*SOURCE_DATA_FOLDER,'sentiment.csv'))

y_set = np.array(list(df_sentiment['SCORE']))
X_set = df_sentiment.to_numpy()[:,1:]

In [49]:
def load_or_calc(name:str, search:HalvingGridSearchCV, X:Any, y:Any) -> HalvingGridSearchCV:
    # Saved-state file
    fn = name+'.bin'
    try:
        # Try opening saverd state.
        with open(fn,'rb') as f:
            return pickle.load(f)
    except:
        # Search best hyperparams.
        warnings.filterwarnings('ignore')
        search.fit(X, y)
        warnings.filterwarnings('default')
        # Save state
        with open(fn,'wb') as f:
            pickle.dump(search,f)
        return search

## Split data

In [50]:
train_size = int(0.75 * y_set.shape[0])
train_size = train_size - train_size%128
print('Rows:',y_set.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X_set,y_set, train_size=train_size)

print('Train X/y:',X_train.shape, y_train.shape)
print('Test X/y:',X_test.shape, y_test.shape)

METHODS = {}

Rows: 21440
Train X/y: (16000, 652) (16000,)
Test X/y: (5440, 652) (5440,)


## Test LogisticRegression

In [51]:
# Name
log_reg_name = 'LogisticRegression'
# Possible LogisticRegression parameters.
params = {
    'penalty': ['l1','l2','elasticnet'],
    'C': np.logspace(-4,4,10),
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'max_iter': [100,1000,2500,5000]
}
# Parameter optimizer.
classifier = HalvingGridSearchCV(
    LogisticRegression(),
    param_grid=params,
    cv=5,
    random_state=int(time()),
    n_jobs=3,
    verbose=1)

In [52]:
# Calculate or load saved.
classifier = load_or_calc(log_reg_name, classifier, X_train, y_train)

In [53]:
print('Estimator:', classifier.best_estimator_)
predicted = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print("Accuracy:", accuracy)
METHODS[log_reg_name] = accuracy

Estimator: LogisticRegression(C=2.782559402207126, max_iter=5000, solver='newton-cholesky')
Accuracy: 0.8216911764705882




## Test LinearSVC

In [54]:
# Name
lsvc_name = 'LinearSVC'
# Possible ElasticNet parameters.
params = {
    'penalty': ['l1','l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': np.arange(0.1,50,10),
    'max_iter': [100,1000,2500,5000]
}
# Parameter optimizer.
classifier = HalvingGridSearchCV(
    LinearSVC(),
    param_grid=params,
    cv=5,
    random_state=int(time()),
    n_jobs=3,
    verbose=1)

In [55]:
# Calculate or load saved.
classifier = load_or_calc(lsvc_name, classifier, X_train, y_train)

In [56]:
print('Estimator:', classifier.best_estimator_)
predicted = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print("Accuracy:", accuracy)
METHODS[lsvc_name] = accuracy

Estimator: LinearSVC(C=10.1, max_iter=5000)
Accuracy: 0.4158088235294118




## Test neural net Module

In [57]:
# Name
nn_name = 'SentimentNN'

class SentimentNN(nn.Module):
    def __init__(self, num_features:int=None) -> None:
        super().__init__()
        if num_features:
            self.fc = nn.Sequential(
                nn.Linear(num_features, 256),
                nn.ReLU(),
                nn.Linear(256,128),
                nn.ReLU(),
                nn.Linear(128,1)
            )
    def forward(self,x):
        return self.fc(x)

class SentimentData(Dataset):
    def __init__(self, X, y) -> None:
        self.X = X
        self.y = y
    def __getitem__(self, index) -> Any:
        return (
            torch.tensor(self.X[index], dtype=torch.float32, device=device), 
            torch.tensor(self.y[index], dtype=torch.float32, device=device)
        )
    def __len__(self):
      return len(self.X)

In [63]:
try:
    model = SentimentNN()
    model.load_state_dict(torch.load("SentimentNN.bin"))
except:
    sentiment_model = None

if not sentiment_model:
    # Data batching
    data_set = SentimentData(X_train, y_train)
    loader = DataLoader(data_set, batch_size=768, shuffle=True)

    sentiment_model = SentimentNN(X_train.shape[1]).to(device=device, dtype=torch.float32)
    optimizer = torch.optim.Adam(sentiment_model.parameters())
    criterion = nn.MSELoss()

    epochs = 150
    for epoc in range(epochs):
        optimizer.zero_grad()
        for X_,y_ in loader:
            predicted = sentiment_model(X_)
            predicted = predicted.reshape(y_.shape[0])
            loss = criterion(predicted, y_)
            loss.backward()
            optimizer.step()

            gc.collect()
            torch.cuda.empty_cache()
    
        print('Epoc:',epoc)
        print('Loss:',loss)
    torch.save(sentiment_model.state_dict(), 'SentimentNN.bin')

predicted = sentiment_model(tensor(X_test, device=device, dtype=torch.float32))
print(predicted)


Epoc: 0
Loss: tensor(0.3122, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 1
Loss: tensor(1.1507, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 2
Loss: tensor(0.6971, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 3
Loss: tensor(0.5523, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 4
Loss: tensor(0.3456, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 5
Loss: tensor(0.2045, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 6
Loss: tensor(0.1380, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 7
Loss: tensor(0.1655, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 8
Loss: tensor(0.2978, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 9
Loss: tensor(0.1478, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 10
Loss: tensor(0.0515, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 11
Loss: tensor(0.0509, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 12
Loss: tensor(0.0419, device='cuda:0', grad_fn=<MseLossBackward0>)
Epoc: 13
Loss: tensor(0.0301, devic