In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


def preprocess(df):

  print('preprocessing')
  df['text']=df.textcontent.apply(lambda x :re.sub(r'[^\w\s]',' ',str(x)))
  df.text=df.text.apply(lambda x : re.sub(r'[^\u0600-\u06FF\s]+',' ',str(x)))
  df.text=df.text.apply(lambda x : re.sub(r'[\u06F0-\u06F9]',' ',str(x)))
  df.text=df.text.apply(lambda x : re.sub(r'[\u0660-\u0669]',' ',str(x)))
  df.text=df.text.apply(lambda x : re.sub(r'\s{2,}',' ',str(x)))
  df.text=df.text.apply(lambda x : re.sub(r'[\u200c]',' ',str(x)))

  return df

class Word2VecVectorizer:
  def __init__(self, model,embedding_size):
    print("Loading in word vectors...")
    self.word_vectors = model
    self.embedding_size=embedding_size
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    X = np.zeros((len(data), self.embedding_size))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split(' ')
      vecs = []
      m = 0
      for word in tokens:
        try:
          vec = self.word_vectors[word]
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

def language_detect():

  data=pd.read_excel("new_final_dataset.xlsx",engine="openpyxl")[['textcontent','target']]
  print("shape:",data.shape)
  data = preprocess(data)
  data.drop_duplicates(subset=['text'],keep='first',inplace=True)
  data.dropna(inplace=True)
  print(data.shape)

  x=data.text
  y=data.target

  x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y,shuffle=True)

  from gensim.models.keyedvectors import KeyedVectors
  file_path = '/content/drive/MyDrive/insta_wchr.vec'
  model = KeyedVectors.load_word2vec_format(file_path)

  vectorizer = Word2VecVectorizer(model,100)
  x_train_cv = vectorizer.fit_transform(x_train)
  x_test_cv= vectorizer.transform(x_test)

  #KNN
  from sklearn.neighbors import KNeighborsClassifier
  KNN = KNeighborsClassifier()
  param_grid = {
      'n_neighbors': [3, 5, 7],
      'weights': ['uniform', 'distance'],
      'metric': ['euclidean', 'manhattan']
    }
  grid_search = GridSearchCV(KNN, param_grid, cv=5)
  grid_search.fit(x_train_cv, y_train)
  best_knn = grid_search.best_estimator_
  best_knn.fit(x_train_cv, y_train)
  y_train_pred = best_knn.predict(x_train_cv)
  y_test_pred = best_knn.predict(x_test_cv)
  report_train = classification_report(y_train, y_train_pred)
  report_test = classification_report(y_test, y_test_pred)
  print("classification_report of KNN")
  print("on Train")
  print(report_train)
  print("on Test")
  print(report_test)

  with open('KNN_model.pkl','wb') as f_KNN:
      pickle.dump(best_knn,f_KNN)

language_detect()

shape: (22593, 2)
preprocessing
(14692, 3)
Loading in word vectors...
Finished loading in word vectors
Numer of samples with no words found: 194 / 11753
Numer of samples with no words found: 38 / 2939
classification_report of KNN
on Train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5605
           1       1.00      1.00      1.00      6148

    accuracy                           1.00     11753
   macro avg       1.00      1.00      1.00     11753
weighted avg       1.00      1.00      1.00     11753

on Test
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1402
           1       0.97      0.93      0.95      1537

    accuracy                           0.95      2939
   macro avg       0.95      0.95      0.95      2939
weighted avg       0.95      0.95      0.95      2939

