### 데이터 전처리

In [1]:
max_length = 256 # sms 최대 길이

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sms.tsv', sep = '\t', )
print(df.columns)
print(df.shape)

Index(['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'], dtype='object')
(5574, 2)


In [4]:
df.columns = ["label", "sms"]

In [5]:
df.head()

Unnamed: 0,label,sms
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,spam,"%^^×？×^×&#****,>,;//×&>>*(*^%=÷#~^&,****)"
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes):
  class_to_idx.update({c:i})

nclass = len(classes)

print("# of classes : %d" % nclass)
print(classes)
print(class_to_idx)

# of classes : 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


In [7]:
new_df = pd.DataFrame({'label':df['label'],
                       'sms':df['sms'].str.slice(
                           start=0, stop=max_length
                       )})

In [8]:
len(new_df)

5574

In [9]:
new_df = pd.DataFrame(new_df.drop_duplicates())

In [10]:
len(new_df)

5171

In [11]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
print(df_shuffled.shape[0])
df_shuffled.head()

5171


Unnamed: 0,label,sms
0,ham,How's it going? Got any exciting karaoke type ...
1,ham,Gd luck 4 ur exams :-)
2,ham,After my work ah... Den 6 plus lor... U workin...
3,ham,"Are you sure you don't mean ""get here, we made..."
4,ham,I agree. So i can stop thinkin about ipad. Can...


In [12]:
# train: test = 9:1
train_ratio = 0.9

# train dataset
s, e = 0, int(df_shuffled.shape[0] * train_ratio)
df_train = pd.DataFrame({'label':df_shuffled['label'][s:e],
                         'sms':df_shuffled['sms'][s:e]})
print("index for train: %d~%d" %(s,e))

# test dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d~%d" %(s,e))
df_test = pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})

index for train: 0~4653
index for test: 4653~5170


In [13]:
# column 수 확인
print(df_train.shape)
print(df_test.shape)

(4653, 2)
(517, 2)


In [14]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
                header=False, index=False, sep='\t')

df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header=False, index=False, sep='\t')

### 데이터 로더

In [15]:
import torch

In [16]:
print(torch.__version__)

1.9.0+cu102


In [17]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53kB)
[K     |██████▏                         | 10kB 16.1MB/s eta 0:00:01[K     |████████████▍                   | 20kB 8.2MB/s eta 0:00:01[K     |██████████████████▌             | 30kB 7.4MB/s eta 0:00:01[K     |████████████████████████▊       | 40kB 6.7MB/s eta 0:00:01[K     |██████████████████████████████▉ | 51kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.5MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully installed torchtext-0.4.0


In [18]:
import torchtext
import numpy as np

In [19]:
from data_loader import DataLoader

### RNN + sms 구현

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.train.tsv',
    batch_size = batch_size,
    valid_ratio = .2,
    device = -1, 
    max_vocab = 999999,
    min_freq = 5,
)

In [27]:
test_loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.train.tsv',
    batch_size = batch_size,
    valid_ratio = .01,
    device = -1,
    max_vocab = 999999,
    min_freq = 5,
)