<a href="https://colab.research.google.com/github/nezihaksu/Airbnb_Istanbul/blob/main/airbnb_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import re

#Explatory Data Analysis

In [141]:
DF = r'/content/drive/MyDrive/listings.csv'
FILE_TYPE = "csv"
IMPUTE = True
ALLOWED_NAN_PERCENTAGE = 10
DROP_KEYWORDS = ["code","zipcode","link","url","id","name","thumbnail","picture","pic","description","note"]
NONE_VALUES = [np.nan,None,"None","Null","NONE","NULL","none","null","nan",""," ",0]
MULTICOLLINEARITY_PERCENTAGE = 10

class Explore():
  """Explore the dataset."""
  def __init__(self,df,file_type:str):
    if file_type == "xlsx" or  file_type == "xls":
      self.df = pd.read_excel(df,engine="python")
    self.df = pd.read_csv(df,engine="python")
    self.file_type = file_type

  def __call__(self):
    return self.df

  def intro(self):
    return "===INFO===",self.df.info(),"===DESCRIPTION===",self.df.describe(),"===DTYPES==",self.df.dtypes
  
  def unique_values(self):
    #Unique values that are in features.
    for column in self.df.columns:
      print(column.upper()+ " UNIQUE VALUES")
      print(str(df[column].unique())+"\n")

  def missing_values(self):
	  missing_percentage = self.df.isnull().sum()*100/len(self.df)
	  plt.figure(figsize=(5, 15))
	  missing_percentage.plot(kind='barh')
	  plt.xticks(rotation=90, fontsize=10)
	  plt.yticks(fontsize=5)
	  plt.xlabel("Missing Percentage", fontsize=14)
	  plt.show()
   
  #Plotting histograms of the numerical features to see the distribution of each of them.
  def dtype_histogram(self,data_type:str):
    numerical_features = self.df.dtypes[self.df.dtypes == data_type].index.to_list()
    self.df[numerical_features].hist(bins = 50,figsize = (20,15))
    plt.show()

  def corr_heat_map(self):
    pass

In [142]:
explore = Explore(df=DF,file_type=FILE_TYPE)

#Cleaning Dataset

In [143]:
class Cleaner(Explore):
  """Clean the dataset."""
  def __init__(self,df,file_type:str):
    super().__init__(df,file_type)
    if file_type == "xlsx" or  file_type == "xls":
      self.df = pd.read_excel(df,engine="python")

  def __call__(self):
    return self.df

  def _drop_type_column(self,pattern:str,inplace:bool):
    for column in self.df.columns:
      if any(self.df[column].astype(str).str.contains(pattern,regex=True)):
        self.df.drop(column,axis=1,inplace=inplace)
    return self.df

  #Expanding one column dataframe into multiple columns according to split character.
  def split_column_into_df(self,column_index:int,split_char:str):
    if len(df.columns) == 1:
      quotes_strip = list(self.df.columns)[0].replace(strip_char,'')
      columns_split = quotes_strip.split(split_char)
      self.df = self.df[self.df.iloc[:,0].name].str.split(pat = split_char,expand = True)
      self.df.columns =  columns_split
      self.df.replace(split_char,'',regex = True,inplace = True)
    print("This method is only for explanding single column dataframes!")
    return self.df

  def drop_missing_columns(self,percentage):
    self.df.dropna(how="all",axis=1,inplace=True)
    #In case of dropna method does not work as expect because of value type \
    #this loop over columns would solve some of the problems.
    for column in self.df.columns:
      if len(self.df[column].unique()) == 1:
        self.df.drop(column,axis=1,inplace=True)
    missing_percentage = self.df.isnull().sum()*100/len(self.df)
    features_left = missing_percentage[missing_percentage < percentage].index
    self.df = self.df[features_left] 
    return self.df

  #Drop columns by their names.
  def drop_column_contains(self,keywords:list):
    for keyword in keywords:
      keyword_pattern = re.compile(keyword)
      for column in self.df.columns:
        if keyword_pattern.search(column):
          self.df.drop(column,axis=1,inplace=True)
    return self.df

  def drop_sentence_columns(self,inplace):
    #sentence_pattern = r'[A-z][A-z]+?\W'
    sentence_pattern = r'(\w \w){2}'
    link_pattern = r'[A-z][A-z]+?://'
    text_pattern = r'|'.join((sentence_pattern,link_pattern))  
    return self._drop_type_column(text_pattern,inplace)
  
  def drop_date_columns(self,inplace:bool):
    date_pattern_dash = r"([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))"
    date_pattern_dot = r"([12]\d{3}.(0[1-9]|1[0-2]).(0[1-9]|[12]\d|3[01]))"
    date_pattern_slash = r"([12]\d{3}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01]))"
    date_pattern_dash_text =  r"([12]\d{3}-([A-z]+)-(0[1-9]|[12]\d|3[01]))"
    date_pattern_dot_text = r"([12]\d{3}.([A-z]+).(0[1-9]|[12]\d|3[01]))"
    date_pattern_slash_text = r"([12]\d{3}/([A-z]+)/(0[1-9]|[12]\d|3[01]))"
    date_pattern = r'|'.join((date_pattern_dash,
                              date_pattern_dot,
                              date_pattern_slash,
                              date_pattern_dash_text,
                              date_pattern_dot_text,
                              date_pattern_slash_text))
    return self._drop_type_column(date_pattern,inplace)

  def drop_special_columns(self,inplace:bool):
    starts_with_special_pattern = r'^[^\w]'
    ends_with_special_pattern = r'[^\w]^'
    starts_ends_special_pattern =  r'|'.join((starts_with_special_pattern,ends_with_special_pattern))
    return self._drop_type_column(starts_ends_special_pattern,inplace)

  
  def strip_signs(self):
    num_pattern = r"[0-9]"
    non_num_pattern = r"[^0-9]"
    for column in self.df.columns:
      if all(self.df[column].astype(str).str.contains(num_pattern,regex=True)):
        self.df[column].replace(non_num_pattern,"",regex=True,inplace=True)
    return self.df

  def imputer(self,strategy="most_frequent"):
    simple_imputer = SimpleImputer(strategy=strategy)
    for column in self.df.columns:
      if pd.DataFrame.any(self.df[column].isnull()):
        self.df[column] = simple_imputer.fit_transform(self.df[column].values.reshape(-1,1))
    print(self.df.describe())
    return self.df      

In [199]:
class Preprocess():

  def __init__(self,df):
    self.df = df
    self.numerical_df,self.categorical_df = self._split_into_cat_num_df()
  
  def __call__(self):
    return self.df
  
  def _split_into_cat_num_df(self):
    num_pattern = r"[0-9]"
    continuous_features = []
    discrete_features = []
    for column in self.df.columns:
      if all(self.df[column].astype(str).str.contains(num_pattern,regex=True)):
        continuous_features.append(column)
      else:
        discrete_features.append(column)
    return self.df[continuous_features],self.df[discrete_features]


  def drop_multicoll_columns(self,allowed_corr_percentage:int):
    corr_matrix = self.numerical_df.corr()
    corr_matrix[corr_matrix]
    #multicoll_indexes = np.where(np.logical_and(corr_matrix < 1.0, corr_matrix > self.corr_percetage))
    return corr_matrix


  def one_hot_encoder(self):
    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='constant')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
      return self.df
  def polytrans(self):
    pass
  

In [145]:
cleaner = Cleaner(DF,FILE_TYPE)

In [None]:
cleaner.drop_column_contains(DROP_KEYWORDS)

In [None]:
cleaner.drop_sentence_columns(inplace=True)

In [None]:
cleaner.drop_date_columns(inplace=True)

In [None]:
cleaner.drop_missing_columns(10)

In [None]:
cleaner.strip_signs()

In [None]:
cleaner.drop_special_columns(True)

In [None]:
df = cleaner.imputer()
df

In [200]:
preprocess = Preprocess(df)

In [201]:
preprocess()

Unnamed: 0,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood_cleansed,market,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,instant_bookable,cancellation_policy,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,f,1.0,1.0,Uskudar,Istanbul,Turkey,41.05650,29.05367,f,Apartment,Entire home/apt,2,1.0,0.0,1.0,Real Bed,62400,2,15400,1,730,1,1,730,730,1.0,730.0,24 months ago,30,60,90,365,1,0,f,flexible,f,1,1,0,0
1,f,2.0,2.0,Besiktas,Istanbul,Turkey,41.06984,29.04545,t,Apartment,Entire home/apt,3,1.0,2.0,2.0,Real Bed,11300,4,20800,30,210,30,30,210,210,30.0,210.0,3 months ago,0,0,0,117,41,5,f,moderate,f,2,1,1,0
2,f,1.0,1.0,Besiktas,Istanbul,Turkey,41.07731,29.03891,t,Serviced apartment,Entire home/apt,6,1.0,2.0,5.0,Real Bed,23200,1,000,21,1125,21,21,1125,1125,21.0,1125.0,2 weeks ago,0,0,0,2,0,0,t,strict_14_with_grace_period,f,1,1,0,0
3,f,1.0,1.0,Beyoglu,Istanbul,Turkey,41.03220,28.98216,f,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,26700,2,9900,5,90,5,5,90,90,5.0,90.0,3 months ago,0,8,8,277,2,1,f,moderate,f,1,1,0,0
4,f,19.0,19.0,Sisli,Istanbul,Turkey,41.04471,28.98567,t,Serviced apartment,Entire home/apt,5,1.0,1.0,3.0,Real Bed,65400,2,6600,3,360,3,3,360,360,3.0,360.0,a week ago,12,34,45,302,0,0,t,moderate,f,19,19,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17323,f,1.0,1.0,Bahcelievler,Istanbul,Turkey,40.99535,28.84067,t,House,Private room,4,1.0,1.0,2.0,Real Bed,10700,1,000,3,1125,3,3,1125,1125,3.0,1125.0,today,29,59,89,269,0,0,t,flexible,f,1,0,1,0
17324,f,1.0,1.0,Sisli,Istanbul,Turkey,41.05814,28.99161,f,Apartment,Entire home/apt,3,1.0,2.0,2.0,Real Bed,30300,1,000,3,15,3,3,15,15,3.0,15.0,today,23,29,41,131,0,0,f,flexible,f,1,1,0,0
17325,f,1.0,1.0,Maltepe,Istanbul,Turkey,40.95070,29.12381,f,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,14800,1,000,1,5,1,1,5,5,1.0,5.0,today,0,7,33,33,0,0,f,flexible,f,1,0,1,0
17326,f,1.0,1.0,Besiktas,Istanbul,Turkey,41.07760,29.02340,t,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,14800,1,000,1,7,1,1,7,7,1.0,7.0,today,29,49,79,115,0,0,f,moderate,f,1,0,1,0


In [202]:
preprocess.drop_multicoll_columns(10)

Unnamed: 0,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
host_listings_count,1.0,1.0,0.002881,-0.025658,0.068656,0.014285,0.010899,0.025701,0.039398,-0.010823,-0.001488,-0.010778,-0.007477,-0.001489,-0.001488,-0.009993,-0.001489,0.020356,0.027357,0.031029,0.082763,0.056718,0.037555,0.391568,0.314488,0.258834,0.036789
host_total_listings_count,1.0,1.0,0.002881,-0.025658,0.068656,0.014285,0.010899,0.025701,0.039398,-0.010823,-0.001488,-0.010778,-0.007477,-0.001489,-0.001488,-0.009993,-0.001489,0.020356,0.027357,0.031029,0.082763,0.056718,0.037555,0.391568,0.314488,0.258834,0.036789
latitude,0.002881,0.002881,1.0,-0.183252,0.070485,0.054859,0.035396,0.037344,0.035571,-0.00156,-0.002801,0.001355,-0.002359,-0.002801,-0.002801,4.1e-05,-0.002801,0.003918,0.004134,0.002703,0.009771,-0.025162,-0.029533,0.022233,0.065574,-0.046133,-0.002026
longitude,-0.025658,-0.025658,-0.183252,1.0,-0.068824,-0.013544,-0.003293,-0.049115,0.003021,-0.006245,-0.002249,-0.008837,-0.006578,-0.002249,-0.002249,-0.008367,-0.002249,-0.034453,-0.032287,-0.028495,-0.037304,-0.002328,0.005187,-0.031269,-0.039825,-0.001346,-0.005715
accommodates,0.068656,0.068656,0.070485,-0.068824,1.0,0.459073,0.556404,0.705799,0.393914,0.003059,0.013935,0.002651,0.003604,0.013935,0.013935,0.003429,0.013935,-0.022845,-0.023806,-0.021967,0.08339,0.087625,0.099631,0.168698,0.274235,-0.065379,0.00404
bathrooms,0.014285,0.014285,0.054859,-0.013544,0.459073,1.0,0.589529,0.567289,0.170643,0.004688,0.007938,0.00512,0.00364,0.007938,0.007938,0.004938,0.007938,0.022006,0.01427,0.010958,0.034041,-0.015942,-0.016315,0.036565,0.055072,-0.007395,-0.007331
bedrooms,0.010899,0.010899,0.035396,-0.003293,0.556404,0.589529,1.0,0.619267,0.219798,0.007603,0.004152,0.007633,0.004991,0.004152,0.004152,0.007128,0.004152,-0.005043,-0.01293,-0.015655,0.01776,0.007853,0.009248,0.026479,0.076783,-0.048588,-0.035584
beds,0.025701,0.025701,0.037344,-0.049115,0.705799,0.567289,0.619267,1.0,0.259591,0.008443,0.020019,0.006295,0.007783,0.020019,0.020019,0.007224,0.020019,0.014394,0.007052,0.003701,0.065916,0.038227,0.035384,0.066118,0.120144,-0.050463,0.062319
guests_included,0.039398,0.039398,0.035571,0.003021,0.393914,0.170643,0.219798,0.259591,1.0,-0.007641,0.004555,-0.007634,-0.006655,0.004555,0.004555,-0.007312,0.004555,-0.061783,-0.060938,-0.057188,0.033559,0.14206,0.148267,0.110959,0.154165,-0.004332,-0.032473
minimum_nights,-0.010823,-0.010823,-0.00156,-0.006245,0.003059,0.004688,0.007603,0.008443,-0.007641,1.0,-0.00056,0.968492,0.949627,-0.00056,-0.00056,0.982037,-0.00056,0.005243,0.004368,0.005086,0.018911,-0.009522,-0.026943,-0.022922,-0.004339,-0.031899,-0.01183
