###  采用RNN进行情绪分析

####  一、导入相应的库

In [1]:
import numpy as np 
import torch 
import torch.nn as nn

In [2]:
# Read the whole reviews file and the whole labels file into memory as
# single strings; they are split into individual entries in later cells.
with open('data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [3]:
# Preview the first 2000 characters of the reviews and the first 20
# characters of the labels, separated by a dashed line.
print(reviews[:2000], '------------', labels[:20], sep='\n')

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

####  二、数据预处理（删除特殊字符）

In [4]:
# string.punctuation holds the ASCII punctuation characters; print it to see
# exactly which characters the next cell removes from the reviews.
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
# Lower-case the corpus, then delete every punctuation character in one pass
# using a translation table that maps each of them to "removed".
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

In [6]:
print(all_text[:200])

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s 


#### 详细步骤

先将对应的按行分割的文本分开 去除掉'\n' 

再将其拼接起来形成一个大的文本

依据空格分割 将大文本变成一个大的词表

In [7]:
# One entry per line of the corpus (one review per line); kept for the
# per-review encoding step further down.
reviews_split = all_text.split('\n')

# Turn the line breaks into spaces so the corpus becomes one long string.
# NOTE: all_text is overwritten here, so re-running this cell without
# re-running the cells above yields a single-element reviews_split.
all_text = all_text.replace('\n', ' ')

# Whitespace tokenisation of the whole corpus.
words = all_text.split()
print(words[:30])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me']


In [8]:
# Show the first review after lower-casing and punctuation removal.
print(reviews_split[0])

bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   


####  对单词进行编码

In [9]:
from collections import Counter

In [10]:
# Word frequencies over the whole corpus.
counts = Counter(words)

# Vocabulary ordered from most to least frequent; words with equal counts
# keep the order in which they were first seen.
vocab = [word for word, _ in counts.most_common()]

# Word -> integer id, starting at 1 so that 0 stays free for padding.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [11]:
# Encode every review as the list of integer ids of its whitespace-separated
# words; an empty line produces an empty list.
reviews_ints = [
    [vocab_to_int[token] for token in line.split()]
    for line in reviews_split
]

In [12]:
# Vocabulary size, followed by the first encoded review.
print(len(vocab_to_int), reviews_ints[:1], sep='\n')

74072
[[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]


####  对标签进行编码

In [14]:
# One label per line. A line that is exactly 'positive' becomes 1; anything
# else (including an empty trailing line) becomes 0.
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

###  过滤掉长度为 0 的文本（过长的文本在后面的 padding 步骤中截断）

In [15]:
# Histogram of review lengths: key = number of tokens, value = how many
# reviews have that length.
review_lens = Counter(map(len, reviews_ints))
print("Zero-length reviews: {}".format(review_lens[0]))
# max() over a Counter iterates its keys, i.e. the distinct lengths.
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [16]:
print('Number of reviews before removing outliers: ', len(reviews_ints))

# Positions of the reviews that contain at least one token.
non_zero_idx = [pos for pos, tokens in enumerate(reviews_ints) if len(tokens) > 0]

# Keep reviews and labels at exactly those positions so they stay aligned.
reviews_ints = [reviews_ints[pos] for pos in non_zero_idx]
encoded_labels = np.array([encoded_labels[pos] for pos in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


#### padding

####  即先用 np.zeros() 得到一个全为 0 的矩阵，再用已有的数据覆盖每一行开头的位置；超过 seq_length 的部分被截断，不足的部分保持为 0

In [24]:
def pad_features(reviews_ints, seq_length):
    """Build a fixed-width integer matrix from variable-length reviews.

    Each review fills the start of its row. Reviews longer than
    ``seq_length`` are cut off after ``seq_length`` tokens; shorter ones
    are followed by zeros.

    Args:
        reviews_ints: sequence of reviews, each a sequence of integer ids.
        seq_length: number of columns in the returned matrix.

    Returns:
        An integer array of shape ``(len(reviews_ints), seq_length)``.
    """
    n_reviews = len(reviews_ints)
    features = np.zeros((n_reviews, seq_length), dtype=int)

    for row_idx, tokens in enumerate(reviews_ints):
        # Truncate first, then write exactly as many columns as remain.
        clipped = np.array(tokens)[:seq_length]
        features[row_idx, :len(clipped)] = clipped

    return features

In [25]:
# Number of token ids kept per review (the width of the feature matrix).
seq_length = 200

# Pad / truncate every encoded review to seq_length columns.
features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# Print the first 10 token ids of the first 30 reviews (rows of the matrix).
print(features[:30,:10])

[[21025   308     6     3  1050   207     8  2138    32     1]
 [   63     4     3   125    36    47  7472  1395    16     3]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [  520   119   113    34 16372  1816  3737   117   885 21030]
 [   11    20  3637   141    10   422    23   272    60  4355]
 [   11     6   692     1    90  2156    20 11728     1  2818]
 [  786   295    10   122    11     6   419     5    29    35]
 [   11     6    24     1   779  3687  2818    20     8    14]
 [   54    10    14   116    60   798   552    71   364     5]
 [   11   215    23     1  1686  2069  1565   867     6     8]
 [    8   725    12   109  1384   168     1   322     4     3]
 [  415    92    35   482     5  2935    94     3   539  1765]
 [    1   330   578    34     3   162   748  2731     9   325]
 [    9    11 10171  5305  1946   689   444    22   280   673]
 [   10    89    23   122    36     5  1801     1 11733

#### 数据集划分

In [26]:
# Fraction of the samples used for training; the remainder is divided
# evenly between the validation and test sets.
split_frac = 0.8

# Train / hold-out split. Features and labels are cut at the same index so
# that every feature row stays aligned with its label.
split_idx = int(len(features) * split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
# The hold-out labels must come from encoded_labels (not from features);
# otherwise the validation and test targets would be token-id rows.
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

# Halve the hold-out portion into validation and test sets.
test_idx = int(len(remaining_x) * 0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

# Every split must have exactly one label per feature row.
assert len(train_x) == len(train_y), "train features/labels are misaligned"
assert len(val_x) == len(val_y), "validation features/labels are misaligned"
assert len(test_x) == len(test_y), "test features/labels are misaligned"

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


####  采用torch的dataloader 进行 load数据以及batch化

In [28]:
import torch
from torch.utils.data import TensorDataset,DataLoader

# Wrap each (features, labels) pair of numpy arrays as a TensorDataset.
# torch.from_numpy keeps the numpy dtype, so the tensors carry whatever
# integer dtype the arrays were created with.
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x),torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))

# Number of reviews per mini-batch.
batch_size = 50

# NOTE(review): all three loaders shuffle; shuffling looks unnecessary for
# the validation and test loaders - confirm whether it is intended.
train_loader = DataLoader(train_data,shuffle = True , batch_size = batch_size)
val_loader = DataLoader(valid_data,shuffle = True , batch_size = batch_size)
test_loader = DataLoader(test_data , shuffle = True , batch_size = batch_size)

In [29]:
# Pull one batch from the training loader to sanity-check shapes and values.
# The built-in next() is used instead of the iterator's .next() method,
# which is not available on DataLoader iterators in current PyTorch releases.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[    10,    143,     30,  ...,      0,      0,      0],
        [   216,     11,     60,  ...,      0,      0,      0],
        [ 10009,  19820,   2408,  ...,      0,      0,      0],
        ...,
        [   407,    215,     23,  ...,      0,      0,      0],
        [    10,     68,     43,  ...,      0,      0,      0],
        [  8332,    550,      8,  ...,      0,      0,      0]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([ 0,  1,  1,  1,  1,  0,  1,  0,  0,  1,  1,  1,  1,  0,
         0,  1,  0,  1,  0,  1,  1,  0,  0,  0,  1,  0,  0,  0,
         0,  1,  0,  1,  1,  0,  0,  1,  1,  0,  0,  1,  0,  1,
         1,  0,  0,  0,  1,  1,  0,  0], dtype=torch.int32)
