In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip


In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F

# Legacy CPU float tensor type alias (not referenced by the other visible cells).
dtype = torch.FloatTensor
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

cuda


In [3]:
import pandas as pd
import numpy as np
import torch
import time
import os
import torch.nn as nn
import torchtext.vocab as vocab
import torchtext.data as data
from torchtext.data import *
import torch.nn.functional as F
import torch.optim as optim
import spacy
from keras.preprocessing import text
from keras.utils import pad_sequences
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [4]:
# --- Global configuration and reproducibility settings ---
seed = 42
BATCH_SIZE = 64
max_len = 50

# Seed both NumPy and PyTorch: seeding torch alone leaves any NumPy-based
# randomness unseeded. torch.manual_seed seeds the generators on all devices.
np.random.seed(seed)
torch.manual_seed(seed)

# Request deterministic cuDNN kernels and keep the auto-tuner off so the
# algorithm choice does not vary between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Location of the pre-trained GloVe vectors (only the path is defined here).
glove_path = "../input/glove840b300dtxt/glove.840B.300d.txt"

In [5]:
def preprocess(data):
    """Replace punctuation and special symbols with spaces, then lower-case.

    Parameters
    ----------
    data : pandas.Series
        Raw text entries. Every entry is cast to ``str`` first, so non-string
        values are processed through their string representation.

    Returns
    -------
    pandas.Series
        Series with the same index as ``data`` in which each character listed
        in ``punct`` has been replaced by a single space and the result has
        been lower-cased.
    """
    # Characters to blank out. The backslash before the multiplication sign is
    # written as an explicit escaped backslash: the previous bare form was an
    # invalid escape sequence that only worked by accident.
    punct = (
        "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`"
        + '""“”’'
        + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    )
    # One translation table, one pass per string, instead of one
    # str.replace call per punctuation character.
    table = str.maketrans({ch: ' ' for ch in punct})
    return data.astype(str).apply(lambda text: text.translate(table).lower())

In [6]:
# Read the tab-separated train/test archives and the sample submission CSV.
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
        '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    )
)
submission = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv'
)

In [7]:
# Clean the 'Phrase' column of both frames; the 'Sentiment' column is taken as-is.
x_train, x_test = (
    preprocess(frame['Phrase']) for frame in (train_data, test_data)
)
y_train = train_data['Sentiment']

In [8]:
# Materialise the cleaned phrases and their labels as plain Python lists.
sentences = [phrase for phrase in x_train]
labels = [label for label in y_train]

In [9]:
PAD = ' <PAD>'  # padding token (leading space included so it can be appended directly)
pad_size =  50     # token count every sentence is brought to (pad short, truncate long)

# Rewrite each sentence in place so it holds exactly `pad_size` tokens.
for idx, sentence in enumerate(sentences):
    tokens = sentence.split()
    missing = pad_size - len(tokens)
    if missing > 0:
        # Too short: append one PAD token per missing position.
        sentences[idx] = sentence + PAD * missing
    else:
        # Long enough: keep only the first `pad_size` tokens.
        sentences[idx] = " ".join(tokens[:pad_size])


In [10]:
# TextCNN parameters
num_classes = len(set(labels))  # number of distinct label values in `labels`
batch_size = 64

# Every token of every (padded) sentence, in corpus order.
word_list = [token for sentence in sentences for token in sentence.split()]

# Sort the unique tokens: the iteration order of a set of strings changes
# between interpreter runs (string hash randomisation), which would assign
# different indices to the same word on each kernel restart.
# NOTE(review): this name shadows the `torchtext.vocab as vocab` import alias.
vocab = sorted(set(word_list))
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_size = len(vocab)


In [11]:
def make_data(sentences, labels):
    """Encode sentences as lists of vocabulary indices and copy the labels.

    Parameters
    ----------
    sentences : iterable of str
        Whitespace-separated token strings; every token must be a key of the
        module-level ``word2idx`` mapping (a missing token raises ``KeyError``).
    labels : iterable
        One label per sentence.

    Returns
    -------
    tuple(list, list)
        ``inputs`` holds one list of indices per sentence; ``targets`` is a
        new list with the labels in their original order.
    """
    inputs = [
        [word2idx[token] for token in sentence.split()]
        for sentence in sentences
    ]
    # Labels stay as plain class values rather than being one-hot encoded.
    targets = list(labels)
    return inputs, targets
# Encode the whole corpus, then wrap indices and labels as LongTensors.
encoded_sentences, encoded_labels = make_data(sentences, labels)
input_batch = torch.LongTensor(encoded_sentences)
target_batch = torch.LongTensor(encoded_labels)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the encoded samples as a test split.
# NOTE(review): this rebinds x_train / x_test / y_train, replacing the values
# assigned to those names by the earlier preprocessing cell; re-running the
# earlier cells out of order afterwards will not see the original data.
x_train, x_test, y_train, y_test = train_test_split(
    input_batch, target_batch, test_size=0.2, random_state=0
)

# torch.tensor() on an existing tensor copy-constructs it and emits a
# UserWarning. torch.as_tensor() returns a tensor input unchanged and only
# converts when the split returned something else.
train_dataset = Data.TensorDataset(torch.as_tensor(x_train), torch.as_tensor(y_train))
test_dataset = Data.TensorDataset(torch.as_tensor(x_test), torch.as_tensor(y_test))
dataset = Data.TensorDataset(input_batch, target_batch)


  """
  


In [13]:
train_loader = Data.DataLoader(
    dataset=train_dataset,      # samples wrapped in a Data.TensorDataset
    batch_size=batch_size,      # number of samples per batch
    shuffle=True,               # reshuffle the training samples every epoch
    #num_workers=2,              # worker processes for loading (disabled)
)
test_loader = Data.DataLoader(
    dataset=test_dataset,       # held-out samples wrapped in a Data.TensorDataset
    batch_size=batch_size,      # number of samples per batch
    shuffle=False,              # evaluation does not depend on order; keep it deterministic
    #num_workers=2,              # worker processes for loading (disabled)
)


In [14]:
class TextCNN(nn.Module):
    """Convolutional text classifier over token-index sequences.

    The forward pass embeds the tokens, applies one 2-D convolution per filter
    size spanning the full embedding width, max-pools each feature map over the
    sequence axis, concatenates the pooled features, applies dropout and maps
    them to class logits with a linear layer.

    Parameters
    ----------
    n_vocab : int, optional
        Number of embedding rows. Defaults to the module-level ``vocab_size``.
    n_classes : int, optional
        Number of output logits. Defaults to the module-level ``num_classes``.
    pad_idx : int, optional
        Index of the padding token. Defaults to ``word2idx['<PAD>']``.
    embed_dim : int
        Embedding width.
    num_filters : int
        Output channels of every convolution.
    filter_sizes : iterable of int
        Convolution heights, i.e. how many consecutive tokens each filter spans.
    dropout : float
        Dropout probability applied to the pooled features.

    Calling ``TextCNN()`` with no arguments builds the same model as before,
    reading the sizes from the notebook globals.
    """

    def __init__(self, n_vocab=None, n_classes=None, pad_idx=None,
                 embed_dim=300, num_filters=256, filter_sizes=(2, 3, 4),
                 dropout=0.5):
        super().__init__()
        self.filter_sizes = tuple(filter_sizes)
        self.embed = embed_dim
        self.num_filters = num_filters
        # Fall back to the notebook-level globals only when no value is given.
        self.num_classes = num_classes if n_classes is None else n_classes
        self.n_vocab = vocab_size if n_vocab is None else n_vocab
        if pad_idx is None:
            pad_idx = word2idx['<PAD>']
        # padding_idx makes the <PAD> row a zero vector that receives no
        # gradient updates.
        self.embedding = nn.Embedding(self.n_vocab, self.embed, padding_idx=pad_idx)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, self.num_filters, (k, self.embed)) for k in self.filter_sizes])

        # `self.dropout` is only ever the module, never the raw probability.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.num_filters * len(self.filter_sizes), self.num_classes)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` and max-pool over the sequence axis.

        ``x`` is the 4-D tensor produced in ``forward``; the result has one
        value per filter for every sample.
        """
        # The kernel spans the whole embedding width, so the last axis has
        # size 1 after the convolution and can be squeezed away.
        x = F.relu(conv(x)).squeeze(3)
        # Pool over every remaining position, then drop the size-1 axis.
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Parameters
        ----------
        x : torch.LongTensor
            Token indices of shape ``(batch, seq_len)``.

        Raises
        ------
        ValueError
            If ``seq_len`` is shorter than the largest filter size, since that
            convolution could not be applied.
        """
        min_len = max(self.filter_sizes)
        if x.size(1) < min_len:
            raise ValueError(
                'sequence length {} is shorter than the largest filter size {}'
                .format(x.size(1), min_len))
        out = self.embedding(x)
        # Add the single input channel expected by Conv2d.
        out = out.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out


In [None]:
model = TextCNN().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

NUM_EPOCHS = 10    # full passes over train_loader
LOG_EVERY = 1000   # print a progress line every this many batches

# Training
model.train()  # make sure dropout is active even if the model was put in eval mode before
for epoch in range(NUM_EPOCHS):
    # Running accuracy counters, reset at the start of every epoch.
    train_correct = 0
    train_seen = 0
    for i, (batch_x, batch_y) in enumerate(train_loader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `pred` holds one logit per class; the predicted class is the argmax.
        # Previously `acc` was never updated, so the log always showed 0.000.
        train_correct += (pred.argmax(dim=1) == batch_y).sum().item()
        train_seen += batch_y.size(0)
        acc = train_correct / train_seen  # fraction correct so far in this epoch

        if (i + 1) % LOG_EVERY == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss.item()),'acc=','{:.3f}'.format(acc))


In [None]:
test_acc_list = []
model.eval()  # switch layers such as dropout to inference behaviour
test_loss = 0
correct = 0
with torch.no_grad():  # gradients are not needed for scoring
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        output = model(batch_x)

        # Index of the largest output per sample = predicted class.
        pred = torch.max(output, dim=1, keepdim=True).indices
        correct += (pred == batch_y.view_as(pred)).sum().item()

# Accuracy over the whole held-out split, as a percentage.
n_samples = len(test_loader.dataset)
accuracy_pct = 100. * correct / n_samples
test_acc_list.append(accuracy_pct)
print('Accuracy: {}/{} ({:.2f}%)\n'.format(correct, n_samples, accuracy_pct))
