# Preprocess scraped Snapp data and engineer features
### 1 - Drop canceled travels
### 2 - Extract destination location from data
### 3 - Extract origin city (translated into English)
### 4 - Extract date and convert it from Jalali (the Persian calendar) to Gregorian
### 5 - Translate car model into English
### 6 - Convert travel cost from rial to toman and translate it (English)
### 7 - Split currency from price
### 8 - Drop unneeded columns
### 9 - Save data into csv file

In [2]:
# for convert Jalali date
from persiantools.jdatetime import JalaliDate

# translate data to english
from deep_translator import GoogleTranslator

# convert num unit
from unidecode import unidecode

# preprocess data 
import pandas as pd

In [3]:
# Load the Snapp summary and the raw travel history, in that order.
df_sum, df_history = (
    pd.read_csv(csv_path)
    for csv_path in ('snapp_summary.csv', 'travel_history.csv')
)

In [3]:
# Display the first row of the travel history to inspect its columns.
df_history.head(1)

Unnamed: 0,day_title,car,price,date,date_time,origin,destination,star
0,سفر شنبه شب ۲۹ مرداد,پراید,۱۷۰٬۰۰۰ ریال,۲۹ مرداد ۱۴۰۱,۲۹ مرداد ۱۴۰۱,آبادان، امیری، شهید محمد منتظری، بعد از دوم، س...,آبادان، کوی کارگر، بلوار دهداری، f27,5


In [4]:
# Count the missing values in each column of the travel history.
df_history.isna().sum()

car                     0
date                    0
star                    0
origin_city             0
price                   0
currency                0
origin_location         0
destination_location    0
dtype: int64

In [5]:
# Count the missing values in each column of the summary data.
df_sum.isna().sum()

Travel Hours     0
Travel Counts    0
Travel Km        0
date             0
dtype: int64

## 1 - Drop canceled travels

In [4]:
# Drop cancelled trips: rows whose title contains "لغو شده".
# - regex=False matches the text literally.
# - na=True counts a missing title as a match, so such rows are dropped too,
#   which is the same outcome the former `== False` comparison produced.
# - .copy() makes the result an independent frame, so the column assignments
#   in the following cells do not write into a slice of the original frame
#   (pandas SettingWithCopyWarning).
df_history = df_history[
    ~df_history["day_title"].str.contains("لغو شده", regex=False, na=True)
].copy()

### 2 - Extract destination location from data

In [5]:
# Build "<second part>,<first part>" from the "،"-separated destination
# address, splitting each address only once.
df_history['destination_location'] = [
    pieces[1] + ',' + pieces[0]
    for pieces in (address.split("،") for address in df_history['destination'])
]

### 3 - Extract origin city (translated in english)

In [7]:
# The origin city is the first "،"-separated part of the origin address.
origin_cities = df_history['origin'].apply(lambda x: x.split("،")[0])

# Translate every distinct city once with a single translator instance,
# instead of creating a translator and sending a request for each row.
fa_to_en = GoogleTranslator(source='fa', target='en')
df_history['origin_city'] = origin_cities.map(
    {city: fa_to_en.translate(city) for city in origin_cities.unique()}
)

In [8]:
# The origin location is the second "،"-separated part of the origin address.
df_history['origin_location'] = [
    address.split("،")[1] for address in df_history['origin']
]

### 4 - Extract date and convert it from Jalali (the Persian calendar) to Gregorian

In [9]:
def convert_date(month,day,yaer):
    """
    Convert a Jalali (Persian) date to the Gregorian calendar.

    For example month "خرداد", day "6", year "1401" is 2022-05-27.

    Params:

    month: Jalali month name (e.g. "خرداد"), or the month number as a string
    day: Jalali day of the month, in Persian or ASCII digits
    yaer: Jalali year such as "1401", in Persian or ASCII digits
          (the misspelt name is kept so keyword callers keep working)

    Returns:
    The result of JalaliDate(...).to_gregorian()
    (a datetime.date according to the persiantools documentation)

    Raises:
    ValueError: if month is neither a known month name nor a number,
                or if day / yaer are not numeric

    """
    # Exact-match lookup: substring replacement would depend on the order of
    # the table, because 'دی' is contained in 'اردیبهشت'.
    month_numbers = {
        'فروردین': 1,
        'اردیبهشت': 2,
        'خرداد': 3,
        'تیر': 4,
        'مرداد': 5,
        'شهریور': 6,
        'مهر': 7,
        'آبان': 8,
        'آذر': 9,
        'دی': 10,
        'بهمن': 11,
        'اسفند': 12,
    }
    month_key = month.strip()
    month_number = month_numbers.get(month_key)
    if month_number is None:
        # Fall back to a month that is already given as a number.
        try:
            month_number = int(unidecode(month_key))
        except ValueError:
            raise ValueError(f"Unknown Jalali month: {month!r}") from None

    # unidecode turns Persian digits into ASCII digits before parsing.
    year_number = int(unidecode(yaer))
    day_number = int(unidecode(day))
    return JalaliDate(year_number, month_number, day_number).to_gregorian()

In [10]:
# Each raw date is split on spaces into day, month name and year tokens.
# convert_date takes (month, day, year), so the first two tokens are swapped.
df_history['date'] = [
    convert_date(tokens[1], tokens[0], tokens[2])
    for tokens in (raw_date.split(' ') for raw_date in df_history['date'])
]

### 5 - Translate car model in English

In [11]:
# Translate every distinct car model once with a single translator instance,
# instead of creating a translator and sending a request for each row.
fa_to_en = GoogleTranslator(source='fa', target='en')
df_history['car'] = df_history['car'].map(
    {model: fa_to_en.translate(model) for model in df_history['car'].unique()}
)

### 6 - Convert travel cost from rial to toman and translate it (English)

In [12]:
# Translate the price text from Persian to English. Each distinct price
# string is translated once with a single translator instance, instead of
# creating a translator and sending a request for each row.
fa_to_en = GoogleTranslator(source='fa', target='en')
df_history['price'] = df_history['price'].map(
    {text: fa_to_en.translate(text) for text in df_history['price'].unique()}
)

In [13]:
# Keep the first space-separated token, remove the thousands commas and drop
# the final digit (rial -> toman), then label the amount as toman.
df_history['price'] = df_history['price'].map(
    lambda amount: amount.split(' ')[0].replace(",", "")[:-1] + ' toman'
)

### 7 - Split currency from price

In [15]:
# The price text is "<amount> <currency>": store the currency label first,
# then replace the price column with the amount as an integer.
df_history['currency'] = df_history['price'].map(
    lambda labelled: labelled.split(' ')[1]
)
df_history['price'] = df_history['price'].map(
    lambda labelled: int(labelled.split(' ')[0])
)

### 8 - Drop unneeded columns

In [17]:
# Keep only the engineered columns, in their final order.
df_history = df_history.loc[:, [
    'car',
    'date',
    'star',
    'origin_city',
    'price',
    'currency',
    'origin_location',
    'destination_location',
]]

### 9 - Save data into csv file

In [19]:
# Write the processed frame to CSV without the index column.
# NOTE(review): 'travel_history.csv' is also the file this notebook reads as
# its input, so this overwrites the raw data and the notebook cannot be
# re-run on it -- confirm this is intended or write to a separate file.
df_history.to_csv('travel_history.csv',index=False)