Load data

In [1]:
import random
import re
import unicodedata

import numpy as np
import pandas as pd

from decision_tree_functions import decision_tree_algorithm, decision_tree_predictions
from helper_functions import train_test_split, calculate_accuracy

In [2]:
# Read the name/gender table from a CSV file in the working directory.
df = pd.read_csv("name_full.csv")
# Preview the first rows.
df.head()

Unnamed: 0,Full_Name,Gender
0,Ngô Xuân Tùng,1
1,Bùi Dương Thảo Vy,0
2,Lưu Thế Huy,1
3,Nguyễn Thị Vân,0
4,Dương Minh Long,1


Data preprocessing

In [3]:
# Name strings used as the model input.
full_data=df['Full_Name']
# Target column (the preview above shows the values 0 and 1).
labels = df['Gender']

In [4]:
def lowerize(text):
  """Return `text` lowercased with Vietnamese diacritics removed.

  The input is first normalized to NFC so that letters written as a base
  character plus combining marks (decomposed / NFD text) become the single
  precomposed code points the character classes below match. Without this
  step such input would keep its combining marks.

  Args:
    text: the string to convert.

  Returns:
    The lowercased string with every accented Vietnamese letter replaced by
    its unaccented ASCII base letter (e.g. "Phạm Quang Tùng" ->
    "pham quang tung").
  """
  patterns = {
  '[àáảãạăắằẵặẳâầấậẫẩ]': 'a',
  '[đ]': 'd',
  '[èéẻẽẹêềếểễệ]': 'e',
  '[ìíỉĩị]': 'i',
  '[òóỏõọôồốổỗộơờớởỡợ]': 'o',
  '[ùúủũụưừứửữự]': 'u',
  '[ỳýỷỹỵ]': 'y'
  }
  # Compose first, then lowercase: a single substitution pass per class then
  # covers both the lower- and upper-case spellings of each letter.
  output = unicodedata.normalize("NFC", text).lower()
  for regex, replace in patterns.items():
    output = re.sub(regex, replace, output)
  return output

In [5]:
# Quick check of lowerize on an accented, mixed-case name.
lowerize("Phạm Quang Tùng")

'pham quang tung'

In [6]:
class TF_IDF():
  """Word-count / TF-IDF featurizer for a corpus of whitespace-separated documents.

  Every document is lowercased and split on whitespace. The constructor
  builds the vocabulary (unless one is supplied) and the document-by-word
  count matrix; `compute_tf`, `compute_idf` and `compute_tf_idf` derive the
  weighted matrices from it.

  Args:
    corpus: sized iterable of strings (e.g. a list of names).
    dictionary: optional sequence of (lowercase) words to use as the
      vocabulary. When None, it is built from the corpus.
    max_count: if given, drop words occurring more than this many times in
      the whole corpus when building the vocabulary.
    min_count: if given, drop words occurring fewer than this many times in
      the whole corpus when building the vocabulary.
    normalize_tf: when true, divide each row of counts by the number of
      in-vocabulary words in that document.
    smooth: when true, use idf = ln((N + 1) / (df + 1)) + 1; otherwise
      idf = ln(N / df) + 1.
    normalize_tfidf: None (no normalization), "l1" or "l2" row normalization
      of the TF-IDF matrix.

  Attributes:
    dictionary: sorted list of vocabulary words.
    word_to_index: dict mapping each vocabulary word to its column index.
    num_word / num_document: matrix dimensions.
    matrix_word_count: float array of shape (num_document, num_word).
  """
  def __init__(self, corpus, dictionary=None, max_count=None, min_count=None, normalize_tf=False, smooth=True, normalize_tfidf=None):
    self.corpus=corpus
    self.max_count=max_count
    self.min_count=min_count
    self.normalize_tf=normalize_tf
    self.smooth=smooth
    self.normalize_tfidf=normalize_tfidf
    # `is not None` (rather than `!= None`) so an array-like dictionary does
    # not trigger an element-wise comparison.
    self.dictionary = dictionary if dictionary is not None else self.create_dictionary()
    self.num_word=len(self.dictionary)
    self.word_to_index = self.map_word_to_index()
    self.num_document = len(self.corpus)
    self.matrix_word_count = self.create_count_matrix()

  def retrieve_word(self, index):
    """Return the vocabulary word stored at column `index`."""
    return self.dictionary[index]

  def create_dictionary(self):
    """Build the sorted vocabulary, honouring `min_count` / `max_count`.

    The bounds apply to the total number of occurrences of a word across the
    whole corpus; a bound that is None is not applied.
    """
    kept_words = []
    for word, count in self.map_word_to_count().items():
      if self.min_count is not None and count < self.min_count:
        continue
      if self.max_count is not None and count > self.max_count:
        continue
      kept_words.append(word)
    return sorted(kept_words)

  def retrieve_index(self, word):
    """Return the column index of `word` (case-insensitive).

    Raises:
      KeyError: if the word is not in the vocabulary.
    """
    return self.word_to_index[word.lower()]

  def word_extraction(self, document):
    """Split a document into words on whitespace."""
    return document.split()

  def map_word_to_count(self):
    """Return a dict mapping each lowercased word to its total corpus count."""
    dict_word_count = dict()
    # Iterate instead of indexing so corpora whose keys are not 0..n-1
    # (e.g. a filtered pandas Series) work as well.
    for doc in self.corpus:
      for word in self.word_extraction(doc.lower()):
        dict_word_count[word] = dict_word_count.get(word, 0) + 1
    return dict_word_count

  def map_word_to_index(self):
    """Return a dict mapping each vocabulary word to its column index."""
    return {word: index for index, word in enumerate(self.dictionary)}

  def create_count_matrix(self):
    """Return the (num_document, num_word) matrix of per-document word counts.

    Words that are not in the vocabulary (filtered out by `min_count` /
    `max_count`, or absent from a user-supplied dictionary) are skipped
    instead of raising KeyError.
    """
    mat = np.zeros((self.num_document, self.num_word))
    for row, doc in enumerate(self.corpus):
      for word in self.word_extraction(doc.lower()):
        col = self.word_to_index.get(word)
        if col is None:
          continue
        mat[row, col] += 1
    return mat

  def compute_tf(self):
    """Return the term-frequency matrix.

    With `normalize_tf` each row is divided by its total count; rows without
    any in-vocabulary word stay all-zero rather than becoming NaN.
    """
    if not self.normalize_tf:
      return self.matrix_word_count
    length_name = np.sum(self.matrix_word_count, axis=1, keepdims=True)
    length_name[length_name == 0] = 1  # avoid 0/0 for empty rows
    return self.matrix_word_count / length_name

  def compute_idf(self):
    """Return the inverse-document-frequency weights, shape (1, num_word)."""
    num_doc_having_word = np.count_nonzero(self.matrix_word_count, axis=0)
    if self.smooth:
      idf = np.log((self.num_document + 1) / (num_doc_having_word + 1)) + 1
    else:
      # A vocabulary word seen in no document has an all-zero column, so its
      # weight is irrelevant; clamping df to 1 keeps tf * idf at 0 instead
      # of producing 0 * inf = NaN.
      idf = np.log(self.num_document / np.maximum(num_doc_having_word, 1)) + 1
    return np.reshape(idf, (1, self.num_word))

  def compute_tf_idf(self):
    """Return the TF-IDF matrix, optionally row-normalized.

    Returns:
      Array of shape (num_document, num_word). With "l1" every non-zero row
      sums to 1 in absolute value; with "l2" every non-zero row has unit
      Euclidean length.

    Raises:
      ValueError: if `normalize_tfidf` is not None, "l1" or "l2".
    """
    tfidf = self.compute_tf() * self.compute_idf()
    if self.normalize_tfidf is None:
      return tfidf
    if self.normalize_tfidf == "l2":
      row_norm = np.sqrt(np.sum(tfidf * tfidf, axis=1, keepdims=True))
    elif self.normalize_tfidf == "l1":
      row_norm = np.sum(np.abs(tfidf), axis=1, keepdims=True)
    else:
      raise ValueError("normalize_tfidf must be None, 'l1' or 'l2', got {!r}".format(self.normalize_tfidf))
    # Column vector of norms -> broadcasts per row; all-zero rows are left as is.
    row_norm[row_norm == 0] = 1
    return tfidf / row_norm

In [7]:
# Small hand-checkable corpus to sanity-check the TF_IDF class.
list_name = ["nguyen nam hai", "pham quang tung", "doan the vinh", "nguyen ba thiem"]
test_TF_IDF = TF_IDF(list_name)
# Vocabulary is sorted alphabetically; matrix rows follow list_name order.
print(test_TF_IDF.dictionary)
print(test_TF_IDF.compute_tf_idf())

['ba', 'doan', 'hai', 'nam', 'nguyen', 'pham', 'quang', 'the', 'thiem', 'tung', 'vinh']
[[0.         0.         1.91629073 1.91629073 1.51082562 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.91629073
  1.91629073 0.         0.         1.91629073 0.        ]
 [0.         1.91629073 0.         0.         0.         0.
  0.         1.91629073 0.         0.         1.91629073]
 [1.91629073 0.         0.         0.         1.51082562 0.
  0.         0.         1.91629073 0.         0.        ]]


In [8]:
# Plain Python list of names and a NumPy array of labels for the featurizer below.
list_of_names=full_data.tolist()
list_of_labels = labels.to_numpy()
# Number of names in the dataset.
print(len(full_data))

26851


In [9]:
# Build the vocabulary and the per-name word counts for the whole dataset.
TF_IDF_full_data = TF_IDF(list_of_names)

# Binary features (1.0 where a word occurs in a name, 0.0 otherwise) with the
# label appended as the last column.
onehot_data = np.hstack([
    (TF_IDF_full_data.matrix_word_count != 0).astype(float),
    list_of_labels.reshape(-1, 1),
])

# Columns are named "0", "1", ... so each feature is addressed by its index as a string.
onehot_data_df = pd.DataFrame(onehot_data, columns=list(map(str, range(onehot_data.shape[1]))))
print(onehot_data_df)

         0    1    2    3    4    5    6    7    8    9  ...  1534  1535  \
0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
1      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
3      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
26846  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
26847  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
26848  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
26849  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
26850  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

       1536  1537  1538  1539  1540  1541  1542  1543  
0       0.0   0.0   0.0   0.0  

Test the models

In [10]:
# Seed both random number generators so the split and the bootstrap samples
# drawn later (bootstrapping uses np.random) are reproducible on a fresh run.
# NOTE(review): train_test_split comes from helper_functions; presumably it
# draws from the `random` module — confirm.
random.seed(42)
np.random.seed(42)
# 0.2 is the test-size argument passed to the helper.
train_df, test_df = train_test_split(onehot_data_df, 0.2)
print(type(train_df.columns))

<class 'pandas.core.indexes.base.Index'>


In [34]:
def bootstrapping(train_df, n_bootstrap):
    """Draw a bootstrap sample of `n_bootstrap` rows from `train_df`.

    Row positions are sampled uniformly with replacement via
    `np.random.randint`, so the same row may appear several times.

    Args:
        train_df: DataFrame to sample from.
        n_bootstrap: number of rows to draw.

    Returns:
        DataFrame with the sampled rows (original index labels are kept).
    """
    n_rows = len(train_df)
    sampled_positions = np.random.randint(low=0, high=n_rows, size=n_bootstrap)
    return train_df.iloc[sampled_positions]

def random_forest_algorithm(train_df, n_trees, n_bootstrap, n_features, dt_max_depth):
    """Train a random forest as a list of independently grown decision trees.

    Each tree is fitted on its own bootstrap sample of `train_df`.

    Args:
        train_df: training DataFrame.
        n_trees: number of trees to grow.
        n_bootstrap: number of rows in each bootstrap sample.
        n_features: passed to `decision_tree_algorithm` as `random_subspace`.
        dt_max_depth: passed to `decision_tree_algorithm` as `max_depth`.

    Returns:
        List with one entry per tree, in training order.
    """
    return [
        decision_tree_algorithm(
            bootstrapping(train_df, n_bootstrap),
            max_depth=dt_max_depth,
            random_subspace=n_features,
        )
        for _ in range(n_trees)
    ]

def random_forest_predictions(test_df, forest):
    """Predict by majority vote over the trees of `forest`.

    Only forest entries that are dicts are asked for predictions; any other
    entry is skipped.

    Args:
        test_df: DataFrame of rows to classify.
        forest: list of trees as returned by `random_forest_algorithm`.

    Returns:
        Series with the most frequent per-row prediction across the trees.
    """
    per_tree_votes = {
        "tree_{}".format(position): decision_tree_predictions(test_df, tree=member)
        for position, member in enumerate(forest)
        if type(member) is dict
    }
    vote_table = pd.DataFrame(per_tree_votes)
    # mode(axis=1) returns one column per tied value; column 0 is the first mode of each row.
    majority_vote = vote_table.mode(axis=1)[0]
    return majority_vote

In [32]:
# Fit one decision tree on the full training split and print its nested-dict structure.
print(decision_tree_algorithm(train_df, counter=0, min_samples=2, max_depth=5, random_subspace=200))

{'1092 = 1.0': [{'1280 = 1.0': [{'1145 = 1.0': [0.0, 1.0]}, {'1047 = 1.0': [0.0, {'461 = 1.0': [0.0, 1.0]}]}]}, {'310 = 1.0': [0.0, {'1145 = 1.0': [0.0, {'711 = 1.0': [0.0, {'1426 = 1.0': [0.0, 1.0]}]}]}]}]}


In [37]:
# Train 10 trees, each on a 1000-row bootstrap sample with random_subspace=50 and max_depth=10.
forest = random_forest_algorithm(train_df, n_trees=10, n_bootstrap=1000, n_features=50, dt_max_depth=10)
predictions = random_forest_predictions(test_df, forest)
# The label was appended as the last column when onehot_data was built, so the
# last column of test_df is used as the ground truth here.
accuracy = calculate_accuracy(predictions, test_df.iloc[:,-1])
# Show the first tree of the forest, then the accuracy on the test split.
print(forest[0])
print("Accuracy = {}".format(accuracy))

{'298 = 1.0': [{'756 = 1.0': [0.0, {'1531 = 1.0': [1.0, {'664 = 1.0': [1.0, {'461 = 1.0': [0.0, 1.0]}]}]}]}, {'1379 = 1.0': [{'365 = 1.0': [1.0, {'1456 = 1.0': [1.0, {'1531 = 1.0': [1.0, {'1294 = 1.0': [0.0, 1.0]}]}]}]}, {'1111 = 1.0': [0.0, {'607 = 1.0': [0.0, {'644 = 1.0': [1.0, {'1130 = 1.0': [0.0, {'1117 = 1.0': [0.0, {'925 = 1.0': [0.0, {'467 = 1.0': [0.0, 1.0]}]}]}]}]}]}]}]}]}
Accuracy = 0.6575418994413408


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Imported under an alias so it does not overwrite the train_test_split from
# helper_functions that the split cell earlier in the notebook calls; with the
# plain name, re-running that cell after this one would hit sklearn's function.
from sklearn.model_selection import train_test_split as sk_train_test_split
# Baseline scikit-learn models for comparison with the hand-written forest.
RF = RandomForestClassifier(max_depth = 1, random_state=0)
DT = DecisionTreeClassifier()
# NOTE(review): leftover sketch of the scikit-learn split; it is not executed.
#_train, X_test, y_train, y_test = train_test_split(TF_IDF_full_data.one, df.iloc[:,-1])

In [None]:
# Fit on every column but the last, using the last column as the target.
# NOTE(review): train_data is not defined in any visible cell — presumably an
# array version of train_df; confirm before relying on a fresh run.
RF.fit(train_data[:,:-1],train_data[:,-1])

In [None]:
# NOTE(review): X_test and X_train are not defined in any visible cell. The
# recorded shapes below have different column counts (754 vs 1409), so the two
# matrices were apparently built with different vocabularies — confirm.
#clf.score(X_test, test_label)
print(X_test.shape)
print(X_train.shape)

(5370, 754)
(21481, 1409)


In [None]:
# Score the fitted forest on every column but the last, against the last column.
# NOTE(review): test_data is not defined in any visible cell — confirm its origin.
RF.score(test_data[:,:-1], test_data[:,-1])

0.5772811918063314

In [None]:
# Last column of test_data.
# NOTE(review): test_data is not defined in any visible cell — confirm its origin.
test = test_data[:,-1]

# Print every entry of that column that equals 1.
for i in test:
  if i == 1:
    print(i)

In [None]:
# Fit the scikit-learn decision tree and report its score on the held-out split.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in any
# visible cell — confirm how they were produced.
DT.fit(X_train, y_train)
DT.score(X_test, y_test)

0.9365410397735736