In [1]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/CI/CI-PRJ/ 

/content/drive/MyDrive/CI/CI-PRJ


## Install Useful Packages

In [3]:
!pip install hazm



## Import Packages

In [4]:
import csv
import numpy as np
import pandas as pd
from hazm import *
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Input, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding

## Load Dataset

In [5]:
# Read both CSV files and discard the column that is not used as a feature
# or label in each of them.
train_df = pd.read_csv('data/train.csv').drop(columns=["Unnamed: 0"])
test_df = pd.read_csv('data/test.csv').drop(columns=["Id"])

train_data, test_data = train_df.to_numpy(), test_df.to_numpy()

# Columns are addressed by position: the first remaining column is used as
# the document text and the second (train only) as the label.
X = train_data[:, 0]
y = train_data[:, 1]
X_test = test_data[:, 0]

for caption, array in (
    ('X_train shape:', X),
    ('y_train shape:', y),
    ('X_test shape:', X_test),
):
    print(caption, array.shape)

X_train shape: (150096,)
y_train shape: (150096,)
X_test shape: (16678,)


## Data preprocessing

In [6]:
def clean_text(text, normalizer=None):
    """Normalize a document and strip everything except plain words.

    Steps, in order: normalization, HTML-tag removal, digit removal,
    Latin-letter removal, removal of the standalone hamza character,
    punctuation removal and whitespace collapsing.

    Args:
        text: Raw document string.
        normalizer: Optional object exposing ``normalize(str) -> str``. When
            omitted, one ``Normalizer(persian_numbers=False)`` is created on
            the first call and reused afterwards, instead of being rebuilt
            for every document.

    Returns:
        The cleaned text, with tokens separated by single spaces.
    """
    if normalizer is None:
        # Build the default normalizer once and cache it on the function.
        normalizer = getattr(clean_text, '_default_normalizer', None)
        if normalizer is None:
            normalizer = Normalizer(persian_numbers=False)
            clean_text._default_normalizer = normalizer

    # Normalize the text
    text = normalizer.normalize(text)
    # Removing html tags
    text = re.sub('<[^>]*>', ' ', text)
    # Removing numbers. \d also matches Persian / Arabic-Indic digits, which
    # [0-9] missed and which the punctuation rule below keeps (they are \w).
    text = re.sub(r'\d', ' ', text)
    # Removing English letters
    text = re.sub('[A-Za-z]', ' ', text)
    # Removing some unnecessary persian characters
    text = re.sub('[ء]', ' ', text)
    # Removing punctuations in string
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove redundant white-spaces
    return ' '.join(text.split())

def preprocess_data(X):
    """Clean every document in ``X`` and remove Persian stop words.

    Each document is passed through ``clean_text``, tokenized with
    ``word_tokenize`` and re-joined with single spaces after dropping the
    tokens found in ``stopwords_list()``.

    Args:
        X: One-dimensional array of raw document strings.

    Returns:
        A new one-dimensional object array of the same length holding the
        processed documents. The input array is not modified, so the cell
        calling this function can be re-run safely and fixed-width string
        arrays cannot truncate the results.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords_list())

    processed = np.empty(X.shape[0], dtype=object)
    for idx in range(X.shape[0]):
        tokens = word_tokenize(clean_text(X[idx]))
        processed[idx] = ' '.join(tok for tok in tokens if tok not in stop_words)

    return processed

In [7]:
# Show the effect of preprocessing on the first training document. The sample
# is wrapped in its own one-element array before being passed in.
sample_text = X[0]
print('Before applying preprocessing:', sample_text)
print('After applying preprocessing:', preprocess_data(np.asarray([sample_text])))

Before applying preprocessing: 
خبرنامه دانشگاه علم و صنعت ايران 
شماره يازدهم از خبرنامه روابط عمومي دانشگاه علم و صنعت 
ايران در 48 صفحه با اخبار و مطالب علمي متنوعي از 
استادان اين دانشگاه منتشر شد. 
در اين شماره از خبرنامه، علاوه بر اختصاص صفحاتي چند به 
ديدار رئيس جمهوري از دانشگاه علم و صنعت و سخنراني در 
جمع دانشجويان اين دانشگاه، دانشكده مهندسي شيمي، 
پژوهشكده مكانيك خودرو و چند بخش ديگر اين دانشگاه معرفي 
شده است. 
تلفن روابط عمومي دانشگاه علم و صنعت ايران 7451180 و 
نشاني اينترنتي آن به قرار زير است: 
ir.ac.iust.WWW

After applying preprocessing: ['خبرنامه دانشگاه علم صنعت ایران شماره یازدهم خبرنامه روابط عمومی دانشگاه علم صنعت ایران صفحه اخبار مطالب علمی متنوعی استادان دانشگاه منتشر شماره خبرنامه اختصاص صفحاتی دیدار رئیس جمهوری دانشگاه علم صنعت سخنرانی دانشجویان دانشگاه دانشکده مهندسی شیمی پژوهشکده مکانیک خودرو دانشگاه معرفی تلفن روابط عمومی دانشگاه علم صنعت ایران نشانی اینترنتی قرار']


In [8]:
# Preprocess the full training and test corpora (one pass over every document).
X = preprocess_data(X)
X_test = preprocess_data(X_test)

## Method 1: Convert a collection of raw documents to a matrix of TF-IDF features

In [16]:
# Unigram + bigram TF-IDF features. The vocabulary is learned from the
# training documents only and then applied unchanged to the test documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X)
X_test_vectorized = vectorizer.transform(X_test)

for caption, matrix in (
    ('X_train_vectorized shape:', X_train_vectorized),
    ('X_test_vectorized shape:', X_test_vectorized),
):
    print(caption, matrix.shape)

X_train_vectorized shape: (150096, 11242971)
X_test_vectorized shape: (16678, 11242971)


### Split train/valid data with same distribution 

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 95% / 5% split. Only a single split is consumed downstream,
# so there is no need to generate 20 of them and re-slice the matrices each
# time just to keep the last one.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=11)
train_idx, valid_idx = next(stratSplit.split(X_train_vectorized, y))

X_train, y_train = X_train_vectorized[train_idx], y[train_idx]
X_valid, y_valid = X_train_vectorized[valid_idx], y[valid_idx]

print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)

X_train shape: (142591, 701647)
X_valid shape: (7505, 701647)


In [17]:
# Replace the stratified split with the complete training set, so the model
# below is fitted on every labelled document.
# NOTE: X_valid / y_valid are now the same objects as X_train / y_train, so
# any "validation" metric computed after this cell is a training metric, not
# a held-out estimate.
X_train = X_train_vectorized
y_train = y
X_valid = X_train_vectorized
y_valid = y

### Model 1: Linear SVM

In [18]:
# Linear SVM with the library's default hyper-parameters, fitted on the
# current X_train / y_train.
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [19]:
# Predict labels for the training and validation matrices.
y_train_pred = clf.predict(X_train)
y_valid_pred = clf.predict(X_valid)

In [20]:
from sklearn.metrics import accuracy_score

# Report the accuracy of each split as a percentage.
for caption, y_true, y_hat in (
    ('TRAIN ACCURACY:', y_train, y_train_pred),
    ('VALID ACCURACY:', y_valid, y_valid_pred),
):
    print(caption, accuracy_score(y_true, y_hat) * 100, '%')

TRAIN ACCURACY: 99.8414348150517 %
VALID ACCURACY: 99.8414348150517 %


### Test

In [21]:
# Predict labels for the vectorized test documents.
y_test_pred = clf.predict(X_test_vectorized)

In [22]:
# Write one "<row index>,<predicted label>" line per test document.
# newline='' lets the csv module control line endings itself (otherwise some
# platforms emit blank rows); the writer gets its own name instead of
# rebinding the file handle.
with open('results34.csv', mode='w', newline='', encoding='utf-8') as results_file:
    writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(enumerate(y_test_pred))

## Method 2: Pretrained Word Embeddings

### One-hot encoding for labels (Label preprocessing)

In [None]:
# OneHotEncoder expects a 2-D input, so give the label vector a column axis.
label_column = y[:, np.newaxis]

# Densify the result explicitly instead of through the constructor flag: that
# flag was named `sparse` in older scikit-learn releases and `sparse_output`
# in newer ones, so passing either name breaks on some versions.
one_hot_encoder = OneHotEncoder()
all_labels = one_hot_encoder.fit_transform(label_column).toarray()

In [None]:
# Sanity check: one row per training document, one column per class.
all_labels.shape

(150096, 34)

### Fetching the Fasttext pretrained embeddings (300 dimensions)

In [None]:
def read_fasttext_vecs(fasttext_file):
    """Load word vectors from a whitespace-separated text embedding file.

    Every data line has the form ``<word> <v1> <v2> ...``. Blank lines are
    skipped, and so is a leading ``<count> <dim>`` header line (two integers),
    which fastText ``.vec`` files start with.

    Args:
        fasttext_file: Path to the UTF-8 encoded embedding file.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each word to an integer index starting at 1,
        ``index_to_words`` is the inverse mapping, and ``word_to_vec_map``
        maps each word to its ``float32`` vector. Indices follow the order in
        which words first appear in the file, so the mapping is identical on
        every run. If a word occurs more than once, the last vector wins.
    """
    words_to_index = {}
    index_to_words = {}
    word_to_vec_map = {}

    with open(fasttext_file, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f):
            parts = raw_line.strip().split()
            if not parts:
                # Tolerate blank lines instead of failing on parts[0].
                continue
            if (line_no == 0 and len(parts) == 2
                    and parts[0].isdigit() and parts[1].isdigit()):
                # "<vocabulary size> <dimension>" header, not a word vector.
                continue

            curr_word = parts[0]
            word_to_vec_map[curr_word] = np.array(parts[1:], dtype=np.float32)
            if curr_word not in words_to_index:
                # Index 0 is left unassigned (presumably reserved for padding).
                next_index = len(words_to_index) + 1
                words_to_index[curr_word] = next_index
                index_to_words[next_index] = curr_word

    return words_to_index, index_to_words, word_to_vec_map

In [None]:
# Load the pretrained embedding file from the working directory.
words_to_index, index_to_words, word_to_vec_map = read_fasttext_vecs('pretrained_embeddings.txt')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 95% / 5% split of the preprocessed documents and their
# one-hot labels. Only a single split is consumed downstream, so there is no
# need to generate 20 of them just to keep the last one.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=42)
train_idx, valid_idx = next(stratSplit.split(X, all_labels))

X_train, y_train = X[train_idx], all_labels[train_idx]
X_valid, y_valid = X[valid_idx], all_labels[valid_idx]

print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)

X_train shape: (142591,)
X_valid shape: (7505,)


In [None]:
# Largest number of space-separated words in any training document. Counting
# the words of the document with the most *characters* is not the same thing
# and can under-report the maximum.
maxLen = max(len(doc.split(' ')) for doc in X)

### Compute the average of embeddings for each document

In [None]:
def sentence_to_avg(sentence, word_to_vec_map, dim=300):
    """Average the embedding vectors of the words in ``sentence``.

    Args:
        sentence: Text whose words are separated by single spaces. No case
            folding is applied.
        word_to_vec_map: Mapping from word to its embedding vector.
        dim: Dimensionality of the embedding vectors (300 for the pretrained
            fastText file used here).

    Returns:
        A ``(dim,)`` float array: the sum of the vectors of all known words
        divided by the total number of words. Words missing from the map
        count as zero vectors, so they still contribute to the divisor.
    """
    words = sentence.split(' ')
    avg = np.zeros((dim,))
    for w in words:
        try:
            avg += word_to_vec_map[w]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum. Only
            # the lookup miss is tolerated; other errors are not hidden.
            continue
    # ''.split(' ') yields [''], so len(words) is never zero here.
    return avg / len(words)

In [None]:
# Turn every document of each split into its averaged embedding vector.
avg_train = [sentence_to_avg(X_train[pos], word_to_vec_map) for pos in range(len(X_train))]

avg_valid = [sentence_to_avg(X_valid[pos], word_to_vec_map) for pos in range(len(X_valid))]

avg_test = [sentence_to_avg(X_test[pos], word_to_vec_map) for pos in range(len(X_test))]

### Model 2: Neural Networks

In [None]:
# The number of output units equals the number of one-hot label columns.
n_classes = y_train.shape[1]

# Two 512-unit ReLU layers, each followed by 50% dropout, on top of the 300-d
# averaged embedding, ending in a softmax over the classes.
model = Sequential([
    Dense(512, activation='relu', input_shape=(300,)),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 512)               51712     
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 34)                17442     
Total params: 331,810
Trainable params: 331,810
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Categorical cross-entropy pairs with the one-hot labels and the softmax
# output layer; accuracy is tracked during training.
model.compile(loss='categorical_crossentropy',
              optimizer='Adam', 
              metrics=['accuracy'])

In [None]:
# Train for 50 epochs with mini-batches of 128, evaluating on the validation
# split after every epoch. The lists of averaged vectors are stacked into 2-D
# arrays before being passed to Keras.
history = model.fit(np.array(avg_train), y_train, epochs=50, batch_size=128, validation_data=(np.array(avg_valid), y_valid))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Persist the trained model to the working directory.
model.save('best-res12.h5')

### Test

In [None]:
# Class probabilities for every test document.
test_features = np.array(avg_test)
res = model.predict(test_features)

# Build a one-hot matrix with a single 1 at the most probable class of each
# row, which is the format the encoder needs to map back to original labels.
sol = np.zeros((test_features.shape[0], n_classes))
sol[np.arange(sol.shape[0]), np.argmax(res, axis=1)] = 1

labels = one_hot_encoder.inverse_transform(sol)

In [None]:
# Write one "<row index>,<predicted label>" line per test document.
# newline='' lets the csv module control line endings itself (otherwise some
# platforms emit blank rows); the writer gets its own name instead of
# rebinding the file handle.
with open('results14.csv', mode='w', newline='', encoding='utf-8') as results_file:
    writer = csv.writer(results_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # labels has one column, holding the decoded class of each row.
    writer.writerows((idx, labels[idx, 0]) for idx in range(labels.shape[0]))