In [77]:
import pandas as pd
import numpy as np
from __future__ import unicode_literals
from hazm import *
import re
import math
from collections import Counter
from tqdm import tqdm

In [93]:
# Load Dataset_Divar_EDA.csv (path relative to the current working directory)
# into the DataFrame that every later cell reads and mutates.
df = pd.read_csv('Dataset_Divar_EDA.csv')

# Normalize text columns

In [94]:
# Shared hazm normalizer instance used by normalize().
# `from hazm import *` binds hazm's public names (Normalizer among them) but
# never the module name `hazm` itself, so `hazm.Normalizer()` raises NameError
# on a fresh kernel; the star-imported class is referenced directly instead.
normalizer = Normalizer()

In [95]:
# Code points of the digits kept by check_char: the ۰-۹ range followed by the
# ASCII 0-9 range.
numbers = list(range(ord("۰"), ord("۹")+1))
numbers.extend(range(ord("0"), ord("9")+1))

# Code points of the letters kept by check_char. The first range starts at "آ"
# and ends 19 code points past "ی".
# NOTE(review): the reason for the +20 margin is not documented — confirm.
chars = list(range(ord("آ"), ord("ی")+20))
# Latin letters are added as two ranges. A single A..z range would also admit
# the six ASCII symbols that sit between "Z" and "a" ([ \ ] ^ _ `); the
# brackets are still accepted because they are listed in `signs`.
chars.extend(range(ord("A"), ord("Z")+1))
chars.extend(range(ord("a"), ord("z")+1))

# Punctuation and spacing characters kept verbatim by check_char.
signs = [",", ".", "?", ";", ":", "(", ")", "$", "%", "!", "\'", "\"", "{", "}", "[", "]", "&", 
           "،", "؛", "«", "»", "؟", "!", " ", "‌","?","@","1"]


In [96]:

# Compiled once at import time instead of on every call.
# Each emoji-bearing block is listed explicitly. A single catch-all range from
# U+24C2 to U+1F251 would also cover CJK, Hangul, Arabic Presentation Forms and
# other ordinary scripts, silently deleting real text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"             # emoji variation selector
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # enclosed alphanumeric / ideographic supplements
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return `string` with emoji and pictographic symbols removed.

    Args:
        string: text to clean.

    Returns:
        A copy of `string` in which every run of characters matched by
        `_EMOJI_PATTERN` is deleted. Letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', string)

def check_char(text):
    """Return `text` with every character outside the allowed tables removed.

    A character is kept when its code point is listed in `chars` or `numbers`,
    or when the character itself is listed in `signs`; the relative order of
    the kept characters is unchanged.
    """
    def is_allowed(symbol):
        code_point = ord(symbol)
        return code_point in chars or code_point in numbers or symbol in signs

    return "".join(filter(is_allowed, text))

In [97]:

def normalize(text):
    """Clean one text cell and run it through the hazm normalizer.

    Steps, in order:
      1. Missing values (None or float NaN) yield an empty string.
      2. Line breaks and zero-width non-joiners (U+200C) become single spaces.
      3. Characters outside the whitelist are dropped by check_char.
      4. The result is passed to `normalizer.normalize`.

    Args:
        text: the raw cell value, normally a str.

    Returns:
        The normalized string, or "" for a missing value.
    """
    # Missing cells would otherwise fail on the string operations below.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    text = str(text)

    # Applied unconditionally: previously the U+200C replacement only ran when
    # the text also contained a newline, so the same word was normalized
    # differently depending on an unrelated line break. Two-character line
    # endings are replaced first so they collapse to one space.
    for separator in ("\r\n", "\n\r", "\n", "\u200c"):
        text = text.replace(separator, " ")

    text = check_char(text)
    return normalizer.normalize(text)

In [98]:
# Overwrite each free-text column with its normalized form, one cell at a time.
for text_column in ('description', 'title', 'sub_title'):
    df[text_column] = [normalize(cell) for cell in df[text_column]]


In [99]:
# Persist the normalized frame to the working directory.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra leading column — confirm downstream readers expect that.
df.to_csv('normalize_divar_dataset.csv')

# TF-IDF encoding

In [85]:
import math
from collections import Counter

def calculate_tf_idf(sentence_list):
    """Compute a TF-IDF weight for every word of every sentence.

    Each sentence is tokenized on whitespace. For a word `w` in sentence `s`:

        tf(w, s) = occurrences of w in s / total number of tokens in s
        idf(w)   = ln(N / number of sentences containing w)

    where N is the number of sentences.

    Args:
        sentence_list: iterable of sentences. Entries that are not strings
            (for example NaN produced by an empty CSV cell) are treated as
            empty sentences.

    Returns:
        A list with one dict per input sentence, in input order, mapping each
        distinct word of that sentence to its tf * idf value. Empty sentences
        yield an empty dict.
    """
    # Tokenize once; every later step reuses these token lists.
    tokenized = [
        sentence.split() if isinstance(sentence, str) else []
        for sentence in sentence_list
    ]
    total_sentences = len(tokenized)

    # Document frequency in a single pass instead of rescanning the whole
    # corpus for every vocabulary word.
    document_frequency = Counter()
    for tokens in tokenized:
        document_frequency.update(set(tokens))

    idf_dict = {
        word: math.log(total_sentences / sentence_count)
        for word, sentence_count in document_frequency.items()
    }

    tf_idf_dict = []
    for tokens in tokenized:
        word_count = Counter(tokens)
        # Divide by the total token count; dividing by the number of distinct
        # words would inflate TF for sentences with repeated words.
        token_total = len(tokens)
        tf_idf_dict.append({
            word: (count / token_total) * idf_dict[word]
            for word, count in word_count.items()
        })

    return tf_idf_dict


In [86]:
# Collect three columns as plain Python lists, addressed by position.
# NOTE(review): positions 5, 6 and 8 are presumably the title, description and
# sub_title columns — verify against df.columns before relying on this.
row_positions = range(len(df))
sentences_title = [df.iat[row, 5] for row in row_positions]
sentences_description = [df.iat[row, 6] for row in row_positions]
sentences_sub_title = [df.iat[row, 8] for row in row_positions]
    
    


In [87]:
# Score each corpus independently; every result is a list with one
# {word: weight} dict per row.
tf_idf_title, tf_idf_description, tf_idf_sub_title = (
    calculate_tf_idf(corpus)
    for corpus in (sentences_title, sentences_description, sentences_sub_title)
)

In [88]:
# Add the encoding columns at fixed positions. The order matters because each
# insert shifts the columns to its right. Re-running this cell raises, since
# the column names will already exist.
for position, column_name, encodings in (
    (6, 'description_encoding', tf_idf_description),
    (5, 'title_encoding', tf_idf_title),
    (10, 'sub_title_encoding', tf_idf_sub_title),
):
    df.insert(loc=position, column=column_name, value=encodings)

In [91]:
# Persist the frame, including the three *_encoding columns, to the working
# directory.
# NOTE(review): the encoding cells hold dicts; CSV presumably stores them as
# their text representation, so they will need parsing on reload — confirm.
df.to_csv('tf_idf_embedding_divar.csv')