In [1]:
import pandas as pd
import re
from google.colab import drive
import string
import pickle
import numpy as np
import gensim

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from numpy.core.fromnumeric import shape
from keras.layers import Dense, Input, Dropout, Flatten, Embedding, CuDNNLSTM, LSTM
from keras.models import Sequential

In [2]:
# Mount Google Drive at /gdrive so that the artifact paths below resolve.
drive.mount('/gdrive')
# Locations on Drive of the artifacts this notebook loads in later cells.
DATASET_PATH = '/gdrive/MyDrive/processed_dialect_dataset.pkl'
# NOTE(review): "vecctorizer" looks like a typo, but the string has to match
# the file name actually stored on Drive - confirm before renaming either one.
VECTORIZER_PATH = '/gdrive/MyDrive/tfidf_vecctorizer.pkl'
W2V_MODEL_PATH = '/gdrive/MyDrive/model4.bin'
TOKENIZER_PATH = '/gdrive/MyDrive/tokenizer.pkl'
EMBEDDING_MATRIX_PATH = '/gdrive/MyDrive/embedding_matrix.pkl'

Mounted at /gdrive


In [3]:
#! pwd

In [4]:
!pip install import_ipynb
import import_ipynb
% cd '/gdrive/MyDrive/Colab Notebooks/'
import data_preprocessing
% cd '/content'

Collecting import_ipynb
  Downloading import-ipynb-0.1.3.tar.gz (4.0 kB)
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-py3-none-any.whl size=2975 sha256=ee1d51d460938c86df820be3ce90eb04de8f16d40f018a812405cfe5d5dd08b5
  Stored in directory: /root/.cache/pip/wheels/b1/5e/dc/79780689896a056199b0b9f24471e3ee184fbd816df355d5f0
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3
/gdrive/MyDrive/Colab Notebooks
importing Jupyter notebook from data_preprocessing.ipynb
Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/content


In [5]:
def _load_pickle(path):
  """Return the object unpickled from the file at `path`."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)


# Dataset, vectorizer, tokenizer and embedding matrix, each read from its pickle
dialect_df = _load_pickle(DATASET_PATH)
vectorizer = _load_pickle(VECTORIZER_PATH)
tokenizer = _load_pickle(TOKENIZER_PATH)
embed_matrix = _load_pickle(EMBEDDING_MATRIX_PATH)

In [6]:
def show_metrics(y_test, y_pred):
  """Print the accuracy score followed by the classification report.

  Args:
    y_test: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with `y_test`.

  Returns:
    None. Both metrics are written to stdout.
  """
  ac = accuracy_score(y_test, y_pred)
  print("Accuracy is :",ac)
  print(classification_report(y_test, y_pred))

In [7]:
# Features: the raw text column as an array, plus the tokenized column
X = dialect_df['text'].values
X_tokenized = dialect_df['tokenized_string']

# Labels: fit the encoder on the dialect column, then map it to integer codes
le = LabelEncoder().fit(dialect_df['dialect'])
y = le.transform(dialect_df['dialect'])

In [None]:
# Stratified 80/20 split of the raw texts with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.20,
  random_state=123,
  stratify=y,
)

### **TF-IDF with SGD Classifier**

In [None]:
# Vectorize both splits with the vectorizer loaded from Drive
X_train_transformed = vectorizer.transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Train SGD. random_state is fixed because SGDClassifier shuffles the training
# data between epochs; without a seed every run reports different metrics.
clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=123)
clf.fit(X_train_transformed, y_train)

# Prediction on test data
y_pred = clf.predict(X_test_transformed)

In [None]:
# Print accuracy and the classification report for the TF-IDF model
show_metrics(y_test=y_test, y_pred=y_pred)

Accuracy is : 0.4928961152335225
              precision    recall  f1-score   support

           0       0.43      0.37      0.40      5259
           1       0.38      0.25      0.30      5258
           2       0.53      0.48      0.50      3237
           3       0.54      0.90      0.68     11527
           4       0.52      0.55      0.54      3099
           5       0.41      0.22      0.29      5584
           6       0.48      0.56      0.52      8422
           7       0.51      0.69      0.59      5524
           8       0.57      0.67      0.62      7300
           9       0.54      0.60      0.57      2308
          10       0.43      0.28      0.34      3823
          11       0.48      0.45      0.47      8749
          12       0.46      0.43      0.45      6214
          13       0.43      0.34      0.38      5367
          14       0.55      0.39      0.45      2887
          15       0.38      0.20      0.26      3248
          16       0.53      0.39      0.44     

### **Word Embeddings with SGD Classifier**

In [None]:
# Build the WORD2VECTOR helper defined in the data_preprocessing notebook.
# NOTE(review): the meaning of the (300, 100) arguments is not visible here;
# 300 presumably corresponds to the embedding dimension - confirm.
w2v = data_preprocessing.WORD2VECTOR(300, 100)

# Load the w2v model stored at W2V_MODEL_PATH
w2v_model = w2v.load_model(W2V_MODEL_PATH)

# Transform the tokenized texts into the features fed to the classifier
X_transformed = w2v.transform(X_tokenized)

In [None]:
# Redo the stratified 80/20 split, this time on the word-embedding features
X_train, X_test, y_train, y_test = train_test_split(
  X_transformed,
  y,
  test_size=0.20,
  random_state=123,
  stratify=y,
)

In [None]:
# Train SGD. random_state is fixed because SGDClassifier shuffles the training
# data between epochs; without a seed every run reports different metrics.
clf2 = SGDClassifier(max_iter=1000, tol=1e-3, random_state=123)
clf2.fit(X_train, y_train)

# Prediction on test data
y_pred = clf2.predict(X_test)

In [None]:
# Print accuracy and the classification report for the word-embedding model
show_metrics(y_test=y_test, y_pred=y_pred)

Accuracy is : 0.3668267132256656
              precision    recall  f1-score   support

           0       0.24      0.08      0.11      5259
           1       0.13      0.38      0.20      5258
           2       0.47      0.22      0.30      3237
           3       0.64      0.85      0.73     11527
           4       0.44      0.48      0.46      3099
           5       0.17      0.22      0.19      5584
           6       0.38      0.24      0.29      8422
           7       0.45      0.67      0.54      5524
           8       0.48      0.64      0.55      7300
           9       0.67      0.51      0.58      2308
          10       0.19      0.12      0.15      3823
          11       0.49      0.18      0.26      8749
          12       0.28      0.34      0.31      6214
          13       0.21      0.09      0.12      5367
          14       0.52      0.43      0.47      2887
          15       0.24      0.06      0.09      3248
          16       0.35      0.10      0.15     

### **Word Embeddings with LSTM**

In [None]:
# One-hot encode the integer labels for the categorical cross-entropy loss
dummy_y = np_utils.to_categorical(y)

# Stratify on the 1-D integer labels rather than on the one-hot matrix. The
# class proportions are the same, but scikit-learn does not have to collapse
# every one-hot row into a key first, and with the same labels and seed the
# train/test partition matches the one used for the SGD experiments.
X_train, X_test, y_train, y_test = train_test_split(
  X, dummy_y, stratify=y, test_size=0.20, random_state=123
)

# Load Preprocessing class for padding
dp = data_preprocessing.DataPreProcessing()
# Length of the longest tokenized sample; passed to dp.pad_data below
longest_sequence = dialect_df['tokenized_string'].map(len).max()
unique_words_count = w2v.num_of_unique_words

In [None]:
# Run pad_data on the train split, then on the test split, with identical settings
X_train_pad, X_test_pad = (
  dp.pad_data(split, longest_sequence, tokenizer) for split in (X_train, X_test)
)

In [None]:
# Model Architecture
EMBEDDING_DIM = 300
# Size the output layer from the one-hot labels instead of hard-coding the
# class count, so the model follows the data if the set of dialects changes.
NUM_CLASSES = y_train.shape[1]

model = Sequential([
  # Frozen embedding layer initialised from the embedding matrix loaded from Drive
  Embedding(
    input_dim=unique_words_count,
    output_dim=EMBEDDING_DIM,
    input_length=X_train_pad.shape[1],
    weights=[embed_matrix],
    trainable=False,
  ),
  # First LSTM returns the full sequence so the second LSTM can consume it
  LSTM(64, return_sequences=True),
  Dropout(0.05),
  LSTM(64),
  Dropout(0.1),
  Dense(NUM_CLASSES, activation='softmax'),
])

In [None]:
# Categorical cross-entropy on the one-hot labels, RMSprop optimizer, track accuracy
model.compile(
  optimizer='rmsprop',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs in batches of 256, holding out 10% of the training data for validation
model.fit(
  X_train_pad,
  y_train,
  batch_size=256,
  epochs=20,
  validation_split=.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3d42488910>

In [None]:
# Model outputs for the padded test set
y_pred = model.predict(X_test_pad)
# Reduce the one-hot targets and the predictions to class indices before reporting
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)
print(classification_report(y_test_class, y_pred_class))

              precision    recall  f1-score   support

           0       0.37      0.32      0.34      5259
           1       0.28      0.17      0.21      5258
           2       0.54      0.47      0.51      3237
           3       0.68      0.84      0.75     11527
           4       0.50      0.51      0.50      3099
           5       0.39      0.27      0.32      5584
           6       0.39      0.54      0.46      8422
           7       0.57      0.63      0.60      5524
           8       0.58      0.68      0.62      7300
           9       0.69      0.58      0.63      2308
          10       0.32      0.28      0.30      3823
          11       0.44      0.51      0.48      8749
          12       0.48      0.34      0.40      6214
          13       0.30      0.41      0.35      5367
          14       0.66      0.53      0.59      2887
          15       0.40      0.23      0.29      3248
          16       0.57      0.40      0.47      1849
          17       0.24    

In [None]:
# This section includes saving all models
# Save label encoder
#with open('/gdrive/MyDrive/label_encoder.pkl','wb') as f:
#  pickle.dump(le, f)

# Save sgd classifier with tfidf
#with open('/gdrive/MyDrive/sgd_tfidf', 'wb') as f:
#  pickle.dump(clf, f)

# Save sgd classifier with word embeddings
#with open('/gdrive/MyDrive/sgd_word_embeddings', 'wb') as f:
#  pickle.dump(clf2, f)

# Save deep neural network
#model.save('/gdrive/MyDrive/dnn_model')