In [1]:
import pandas as pd
import numpy as np
import abydos
from abydos import distance

In [2]:
def dep_input_output_col(prev_df,now_df,desc):
    new_col = set(now_df.columns) - set(prev_df.columns)
    old_col = set(list(prev_df.columns))
    sort_old_cols = sorted(old_col,key=lambda x:len(x))[::-1]
    sort_new_cols = sorted(new_col,key=lambda x:len(x))[::-1]
    desc = ops["description"]

    import re
    out_cols = []
    in_cols = []
    for x in sort_new_cols:
        sx = r'\b{}\b'.format(x)
        #print(sx,desc)
        xx = re.findall(sx,desc)
        #print(xx)
        for yy in xx:
            desc = desc.replace(yy,"")
        out_cols = out_cols + xx
    for x in sort_old_cols:
        sx = r'\b{}\b'.format(x)
        #print(sx,desc)
        xx = re.findall(sx,desc)
        #print(xx)
        for yy in xx:
            desc = desc.replace(yy,"")
        in_cols = in_cols + xx
    
    out_cols = list(set(out_cols+list(new_col)))
        
    return in_cols,out_cols
    #ops["description"]

#dep_input_output_col(prev_df,now_df,ops["description"])

# General Instruction

- Cleanup dataset based on the information that is given:
You need to clean the dataset according to the information that is given to you. This means that there are problems with the dataset that need to be fixed, and you should use the information given to you to determine what those problems are and how to fix them.

- Each case has different data quality problems, there will be hint and additional information that can help you understand the problem:
Each row in the dataset may have different data quality problems. There will be hints and additional information provided to help you understand what the specific problem is with each row.

- You can do any approach on cleaning the data, but you should clean the instructed column only:
You have the freedom to use any approach to clean the data, but you should only clean the instructed column. This means that you should not modify any other columns in the dataset, or add or remove any rows.

- Do not create new column or remove any column. Also do not create new row, or remove any row:-
You are not allowed to create new columns or remove any columns from the dataset. You are also not allowed to add or remove any rows.

- Each column will have a flag column something equivalent to <column\_name>\_flag. This column can be used to flag the row if you want to not include it to the downstream task. 0: safe_flag, 1: delete_flag, 2: null_flag (if you want to still include the row with null treatment). You can also add a new category but please add justification and explanation of the new category, there are three categories you can use:
safe_flag (0): this row is safe to use in downstream tasks
delete_flag (1): this row should be deleted and not used in downstream tasks
null_flag (2): this row can be included in downstream tasks but with null treatment.
You can also add a new category, but you need to provide a justification and an explanation for the new category. It is worth to note that the completeness of the dataset is also matter, so try not to flag to many things, and do your best to clean the values.

- For each data cleaning task, we have provided a function that represents the goal of the cleaning. For example, clean_duplicate_id(df) is the function for removing duplicate ID values. These functions take a DataFrame as input and return the cleaned version of the DataFrame.

    In each chunk of data cleaning task, you will see the following three parts:

    1. The clean_<name> function that performs the specific cleaning task.
    2. The execution of the cleaning function on the DataFrame.
    3. A checking part to help you evaluate the effectiveness of the cleaning.
    
  While you can create new cells and add additional code, the cleaning must be performed through the provided cleaning functions. You can adjust the order of the cleaning steps, but please try to move the whole chunks of code to avoid any errors.

The cleaning task will be considered complete if this notebook can be run sequentially by executing "restart and runall"




# Purpose
The purpose of this dataset is to conduct exploratory analysis of the listings and create a prediction model for listing price using some columns from the dataset. This means that the dataset is intended to be used to explore the characteristics and features of the listings, and to build a model that can predict the price of a listing based on certain variables in the dataset. The goal is to gain insights into the factors that influence the price of a listing and to develop a model that can accurately predict listing prices based on those factors.

# Columns and Dataset Description
- id: a unique identifier for each listing.
- name: the name or title of the listing, as provided by the host.
- host_id: a unique identifier for each host.
- host_name: the name of the host who listed the property.
- neighbourhood_group: the larger geographic area in which the listing is located (e.g. a borough or group of neighborhoods).
- neighbourhood: the specific neighborhood in which the listing is located.
- latitude: the latitude coordinate of the listing.
- longitude: the longitude coordinate of the listing.
- room_type: the type of space that is being listed (e.g. an entire apartment, a private room, a shared room).
- price: the nightly price of the listing, in the currency specified in the dataset.
- minimum_nights: the minimum number of nights that a guest must book the listing for.
- number_of_reviews: the total number of reviews that the listing has received.
- last_review: the date of the most recent review of the listing.
- reviews_per_month: the average number of reviews per month that the listing has received.
- calculated_host_listings_count: the total number of listings that the host has on Airbnb.
- availability_365: the number of days per year that the listing is available for booking.
- number_of_reviews_ltm: the total number of reviews that the listing has received in the last 12 months.
- license: a license number for the listing, if applicable (this column may not be present in all versions of the dataset).

Besides the columns above, there are columns pre-defined for flagging the rows based on particular data cleaning context:
- id_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the id column (duplicate).
- host_id_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the host_id column.
- neighbourhood_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the neighbourhood column.
- latitude_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the latitude column.
- longitude_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the longitude column.
- minimum_nights_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the minimum_nights column.
- number_of_reviews_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the number_of_reviews column.
- last_review_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the last_review column.
- room_type_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the room_type column.

In [3]:
# transform dcm
import logging 
import pandas as pd

logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.INFO)

class TransformDCM():
    def __init__(self,trace):
        self.trace = trace
        self.source = []
        self.dataset = []
        self.state = []
        self.array = []
        self.column = []
        self.row = []
        self.cell = []
        self.cell_values = []
        self.column_position = []
        self.row_position = []
        self.user = {None: -1}
        
        self.value_derived_from = []        
        self.col_derived_from = []
        self.state_derived_from = []
        self.col_dependency = []
        self.state_detail = []
        
        self.pd_index = None
        
        self.source_id = 0
        self.dataset_id = 0
        self.array_id = 0
        self.col_id = 0
        self.row_id = 0
        self.cell_id = 0
        self.value_id = 0
        self.state_id = -1
        self.col_pos_id = 0
        self.row_pos_id = 0
        self.user_id = 0
        self.execution_id = 0
        self.prev_state_id = -2        
        
        self.col_names_coll = set()
        
        
        self.curr_df = None
        self.curr_col = None
        self.curr_row = None
        self.curr_index = None
        
        self.curr_row_pos = {}
        self.curr_col_pos = {}
        self.curr_col_schema = []
        self.curr_row_list = []
        
        self.curr_state = 0            
        
    
    def render_curr_df(self):
        pass
    
    def render_col(self):
        pass
    
    def render_row(self):
        pass
        
    
    def init_df(self,df):
        self.pd_index = pd.DataFrame(np.empty(df.shape),dtype=object)    
        self.col_names = df.columns
        
        for i,x in enumerate(df.to_records()):
            jj = 0
            #print(i)
            for j,y in enumerate(x):
                #print(j)
                if j==0:
                    continue
                
                self.cell.append((self.cell_id,jj,self.row_id))
                #print(y)
                #self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0],-1))
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y,-1))
                #print(self.pd_index.loc[i,jj])
                self.pd_index.loc[i,jj] = (self.cell_id,self.value_id,i,jj) 
                
                self.value_derived_from.append((self.cell_id,self.state_id,-1))

                self.cell_id+=1
                self.value_id+=1
                if i == 0:
                    if jj == 0:
                        prev_j = -1
                    
                    
                    self.column.append((self.col_id,self.array_id))                    
                    self.column_position.append((self.col_pos_id,self.col_id,self.state_id,self.col_names[jj],prev_j,-1))
                    self.col_names_coll.add(self.col_names[jj])
                    self.curr_col_schema.append((self.col_names[jj],jj,self.col_id,prev_j))
                    self.curr_col_pos[self.col_id] = (self.col_pos_id,prev_j)
                    prev_j = jj
                    self.col_pos_id+=1                    
                    self.col_id+=1    
                jj+=1
            if i == 0:
                prev_i = -1
                
            self.row.append((self.row_id,self.array_id))            
            self.row_position.append((self.row_pos_id,self.row_id,self.state_id,prev_i,-1))
            self.curr_row_pos[self.row_id] = (self.row_pos_id,prev_i)
            self.curr_row_list.append(self.row_pos_id)
            prev_i = self.row_id
            self.row_id+=1       
            self.row_pos_id+=1            
        
        """            
        col_id = np.where(df.columns==col.name)[0][0]
        columns.append((col_id,self.array_id))
        #temp_col = []
        for i,x in enumerate(col):
            if not row_processed:
                rows.append((i,array_id))
            #temp_col.append((cell_id,col_id,i))
            cells.append((cell_id,col_id,i))
            cell_values.append((value_id,cell_id,state_id,x,-1))
            pd_index.loc[i,col_id] = (cell_id,value_id,col_id,i)        
            cell_id+=1
            value_id+=1
            #print(i,col_id)
        row_processed = True        
        """
    
    def init_dataset(self,tt):
        logging.debug("init dataset")
        # get filename from trace        
        df = tt[5]
        code = tt[6]
        self.source.append((self.source_id,code,"dataframe"))
        self.dataset.append((self.dataset_id,self.source_id))
        self.array.append((self.array_id,self.dataset_id))
        # state creation
        
        prev_state_id = self.state_id
        self.state_id+=1        
        self.state.append((self.state_id,prev_state_id))            
        
        # generate column, row, cell
        self.init_df(df)
        self.state_id+=1        

        
        #self.state_id+=1
        #self.array_id+=1
        #self.dataset_id+=1
        #self.source_id+=1
        
    def init_dataset_df(self,df,state_ss={'op':"initial"},fname=None):
        logging.debug("init dataset")
        self.source.append((self.source_id,fname,"dataframe"))
        self.dataset.append((self.dataset_id,self.source_id))
        self.array.append((self.array_id,self.dataset_id))
        # state creation
                
        # generate column, row, cell
        self.init_df(df)       


        self.state.append((self.state_id,self.prev_state_id))
        #self.state_detail.append((prev_state_id,state_ss))
        self.state_detail.append((self.state_id,state_ss,self.execution_id,self.prev_state_id,None))      
        
        self.prev_state_id = self.state_id
        self.state_id+=1

    
    def change_column_schema(self,prev_df,now_df):
        #old_col = list(self.curr_col)  
        old_col = list(prev_df.columns)
        now_col = list(now_df.columns)
        #logging.debug(now_df)
        #logging.debug(("old new col",old_col,self.curr_col_schema,now_col))
        new_col = set(now_col)-set(old_col)
        ocol = set(old_col) - set(now_col)
        
        logging.debug(("change_col_schema old_col",old_col,self.curr_col_schema,now_col))
        logging.debug(("change_col_schema new_col",new_col,ocol))
        
        old_schema = [x[0] for x in self.curr_col_schema]
        
        temp_new_col = []
        
        temp_prev = None
        
        logging.debug(("change_col_schema old_schema:",old_schema))
                
        for n_idx,x in enumerate(now_col):
            logging.debug("change_col_schema now_col:")
            try:
                idx_schema = old_schema.index(x)
            except:
                # check the potential column
                for y in ocol:                    
                    test = self.curr_df.loc[:,y].fillna(0) == now_df.loc[:,x].fillna(0)
                    logging.debug(("test",test.sum(),self.curr_df.shape[0],self.curr_df,now_df))
                    if test.sum() == self.curr_df.shape[0]:
                        idx = old_col.index(y)
                        prev_idx = (-1,None)
                        logging.debug(("idx_test",idx))
                        if idx > 0:
                            prev_idx = old_col[idx-1]
                            idx_n = now_col.index(x)
                            now_idx = self.curr_col_schema[idx_n]
                            prev_col_schema = (None,None,-1,None)
                            if idx_n > 0:
                                prev_col_schema = self.curr_col_schema[idx_n-1]
                            logging.debug(("change_col_schema now_idx:",now_idx,prev_idx))
                            self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,prev_col_schema[2],now_idx[1]))
                            logging.debug(("change_col_schema adding_col_pos1:",(self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2])))
                            temp_new_col.append((x,self.col_pos_id,now_idx[2],now_idx[3]))
                            #temp_new_col.append((x,self.col_pos_id,now_idx[2]))
                            #temp_new_col.append((x,self.col_pos_id,-1))
                            self.col_pos_id+=1  
                            
                            logging.debug(("change_col_schema now_idx",now_idx))
                            self.col_dependency.append((self.state_id,now_idx[-2] if len(now_idx)==4 else now_idx[-1],now_idx[-2] if len(now_idx)==4 else now_idx[-1]))                                                                      
                            break
                continue
                                        
            old_schema_idx = self.curr_col_schema[idx_schema]
            idx = idx_schema
            #idx = old_col.index(x)
            #prev_idx = (-1,None)
            prev_idx = -1
            prev_old_schema = (None,None,None,None)
            if idx > 0:
                prev_idx = idx - 1
                prev_old_schema = self.curr_col_schema[prev_idx]            

            prev_nidx = -1
            prev_new_schema = (None,None,None,None)
            if n_idx > 0:     
                prev_nidx = n_idx - 1
                prev_new_schema = temp_new_col[-1]
                
            #logging.debug(("change_col_schema old",n_idx,x,idx,prev_idx,self.curr_col_schema[idx_schema]))
            #logging.debug(("change_col_schema cur_col_schema:",self.curr_col_schema[n_idx][0],x))
            logging.debug(("change_col_schema prev next:",prev_old_schema,prev_new_schema))
            
            #if self.curr_col_schema[n_idx][0] != x:
            if prev_old_schema[2] != prev_new_schema[2]:
                #if prev_idx[1] != prev_nidx[1]: 
                #if prev_idx[0] != prev_nidx[1]:         
                #logging.debug(("tempnewcol:",temp_new_col[n_idx-1], now_idx))
                #if temp_new_col[n_idx-1][2] != now_idx[2]:
                """
                if temp_new_col[n_idx-1][2] != now_idx[2]:
                    idx_n = now_col.index(x)
                    self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2]))
                    logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                    #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                    temp_new_col.append((x,self.col_pos_id,now_idx[2],temp_prev))
                    self.col_pos_id+=1
                """
                self.column_position.append((self.col_pos_id,old_schema_idx[1],self.state_id,x,prev_new_schema[1],old_schema_idx[1]))
                #logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                temp_new_col.append((x,self.col_pos_id,old_schema_idx[1],prev_new_schema[1]))
                self.col_pos_id+=1
                #aaa
            else:
                 #temp_new_col.append((x,now_idx[1],now_idx[2],now_idx[3]))
                temp_new_col.append(self.curr_col_schema[idx_schema])
        
        
        logging.debug(("change_col_schema temp_new_col:",temp_new_col))
        
        self.curr_col_schema = temp_new_col
                
        return None
        
        for n_idx,x in enumerate(now_col):
            try:
                idx_schema = old_schema.index(x)
            except:
                # check the potential column
                for y in ocol:                    
                    test = self.curr_df.loc[:,y].fillna(0) == now_df.loc[:,x].fillna(0)
                    logging.debug(("test",test.sum(),self.curr_df.shape[0],self.curr_df,now_df))
                    if test.sum() == self.curr_df.shape[0]:
                        idx = old_col.index(y)
                        prev_idx = (-1,None)
                        logging.debug(("idx_test",idx))
                        if idx > 0:
                            prev_idx = old_col[idx-1]
                            idx_n = now_col.index(x)
                            now_idx = self.curr_col_schema[idx_n]  
                            logging.debug(("now_idx:",now_idx,prev_idx))
                            self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2]))
                            logging.debug(("adding_col_pos1:",(self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2])))
                            temp_new_col.append((x,self.col_pos_id,now_idx[2],now_idx[3]))
                            #temp_new_col.append((x,self.col_pos_id,now_idx[2]))
                            #temp_new_col.append((x,self.col_pos_id,-1))
                            self.col_pos_id+=1  
                            
                            logging.debug(("now_idx",now_idx))
                            self.col_dependency.append((self.state_id,now_idx[-2] if len(now_idx)==4 else now_idx[-1],now_idx[-2] if len(now_idx)==4 else now_idx[-1]))                              
                continue
            
            old_schema_idx = self.curr_col_schema[idx_schema]
            
            idx = old_col.index(x)
            prev_idx = (-1,None)
            next_idx = None
            if idx > 0:
                prev_idx = old_col[idx-1]
            #if idx < len(old_col)-1:
            #    next_idx = old_col[idx+1]

            logging.debug(("change_schema",n_idx,x,idx,prev_idx,self.curr_col_schema[idx_schema]))
                        
            now_idx = self.curr_col_schema[idx_schema]    
            
            prev_nidx = (-1,None)
            if n_idx > 0:                
                prev_nidx = (n_idx-1,now_col[n_idx-1])                        
                prev_idx = self.curr_col_schema[n_idx-1]
            
            #logging.debug((self.curr_col_pos[old_schema_idx[2]],now_idx))
            
            
            if temp_prev == None:
                temp_prev = -1
            else:
                temp_prev = now_idx[-1]
                
            #if self.curr_col_pos[old_schema_idx[2]][1] != now_idx[3]:
            #print(now_idx,temp_new_col[n_idx-1])
            logging.debug(("prev_idx,nidx:",prev_idx,prev_nidx,temp_new_col,n_idx))
            if prev_idx[1] != prev_nidx[1]:         
                #if prev_idx[0] != prev_nidx[1]:         
                logging.debug(("tempnewcol:",temp_new_col[n_idx-1], now_idx))
                #if temp_new_col[n_idx-1][2] != now_idx[2]:
                if temp_new_col[n_idx-1][2] != now_idx[2]:
                    idx_n = now_col.index(x)
                    self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2]))
                    logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                    #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                    temp_new_col.append((x,self.col_pos_id,now_idx[2],temp_prev))
                    self.col_pos_id+=1
            else:
                 temp_new_col.append((x,now_idx[1],now_idx[2],now_idx[3]))
            
                                
            """
            #if idx_n > 0:
            prev_idx_n = -1
            try:
                prev_idx_n = now_col[idx_n-1]
            except:
                pass
            logging.debug(("test column",prev_idx,prev_idx_n))

            if prev_idx is not None and prev_idx[0]!=prev_idx_n:
                self.column_position.append((self.col_pos_id,prev_idx[0],self.state_id,x,prev_idx_n,prev_idx[1]))
                self.col_pos_id+=1        
            #self.curr_col_schema.insert(idx,(x,self.col_id))            
            self.col_names_coll.add(x)
            """
            
        self.curr_col_schema = temp_new_col
            
        pass
    
    def change_row_position(self,prev_df,now_df):
        old_row = list(self.curr_row)
        now_row = list(now_df.index)
        #new_row = set(now_row)-set(old_row)
        
        temp_cur_row_pos = []
        for x in now_row:
            #logging.debug((x))
            temp_cur_row_pos.append(self.curr_row_pos[x])            
        
        logging.debug(temp_cur_row_pos)
        pass
    
    def add_column(self,prev_df,df,state_ss=None):   
        #old_col = self.curr_col
        old_col = prev_df.columns
        now_col = df.columns
        new_col = set(now_col)-set(old_col)
        logging.debug(("new_col:",new_col,self.curr_col_schema))
        
        # sort new_col by the index
        new_col = list(filter(lambda x:x in new_col,now_col))

        in_col,out_col = dep_input_output_col(prev_df,df,state_ss["operation"]["description"])
        #print("in_col,out_col",in_col,out_col)
        
        old_col_schema = self.curr_col_schema.copy()
        
        for j,x in enumerate(new_col):
            last_col = self.curr_col_schema[-1][1]
            new_col_val = df.loc[:,[x]]
            logging.debug(new_col_val.values.tolist())
            idx = list(now_col).index(x)
                
            logging.debug(idx)
            if idx > 0:
                prev_idx = self.curr_col_schema[idx-1]
            
            self.column_position.append((self.col_pos_id,self.col_id,self.state_id,x,prev_idx[2],-1))       
            logging.debug(("curr_col_schema",self.curr_col_schema))
            #self.curr_col_schema.insert(idx,(x,self.col_pos_id,self.col_id,self.curr_col_schema[-1][3] if len(self.curr_col_schema[-1])==4 else self.curr_col_schema[-1][-1]))
            self.curr_col_schema.insert(idx,(x,self.col_pos_id,self.col_id,self.curr_col_schema[-1][2]))
            self.curr_col_pos[self.col_id] = (self.col_pos_id,last_col)
            self.col_names_coll.add(x)            
            
            # add values
            self.column.append((self.col_id,self.array_id))
            #self.cell_values.append()
            temp_idx = []
            for y,i in zip(new_col_val.values.tolist(),self.curr_row):
            #for y,i in zip(new_col_val.values.tolist(),self.curr_row):
                self.cell.append((self.cell_id,self.col_id,i))
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0][0] if y[0]!=None else y[0],-1))
                #print(self.pd_index.loc[i,jj])
                #self.pd_index.loc[i,jj] = (self.cell_id,self.value_id,i,jj) 
                temp_idx.append([(self.cell_id,self.value_id,i,self.col_id)])
                
                # add linkage to the derived by cell
                # lookup value id for input col_value
                logging.debug(("in_col:",in_col,self.curr_col_schema))
                for z in in_col:                    
                    zzl = [x[-2] if len(x)==4 else x[-1] for x in list(filter(lambda x:(x[0]==z),self.curr_col_schema))]
                    logging.debug(("zzl:",zzl))
                    for zz in zzl:                        
                        op_cid = list(filter(lambda x:(x[1]==zz)&(x[2]==i),self.cell))
                        logging.debug(("op_cid:",op_cid))
                        for zzz in op_cid:
                            op_val = list(filter(lambda x:x[1]==zzz[0],sorted(self.cell_values,key=lambda x:x[0])))[::-1]
                            logging.debug(("op_val:",op_val))
                            for v in op_val:
                                self.value_derived_from.append((self.value_id,self.state_id,v[0]))
                                break
                                            
                #filter(lambda x:,self.cell_values)
                
                self.value_id+=1
                self.cell_id+=1
            
            #self.pd_index[self.col_id] = pd.DataFrame(temp_idx)
            self.pd_index.insert(loc=idx,column=self.col_id,value=pd.DataFrame(temp_idx)[0].tolist())
            
            #self.curr_col_pos[self.col_id] = (self.col_pos_id,prev_j)

            for z in in_col:
                zzl = [x[-2] for x in list(filter(lambda x:(x[0]==z),self.curr_col_schema))]
                for zz in zzl:                        
                    self.col_dependency.append((self.state_id,self.col_id,zz))                                            
            
            #prev_j = j
            self.col_pos_id+=1                    
            self.col_id+=1
        
        # normalize column position
        temp_new_col = []
        for n_idx,x in enumerate(self.curr_col_schema):
            filtered_col = list(filter(lambda y: y[2] == x[2],old_col_schema))
            if len(filtered_col)>0:
                idx = [y[2] for y in old_col_schema].index(x[2])
                prev_idx = -1
                prev_old_schema = (None,None,None,None)
                if idx > 0:
                    prev_idx = idx - 1
                    prev_old_schema = old_col_schema[prev_idx]
                old_schema_idx = old_col_schema[idx]
                
                prev_nidx = -1
                prev_new_schema = (None,None,None,None)
                if n_idx > 0:     
                    prev_nidx = n_idx - 1
                    prev_new_schema = self.curr_col_schema[prev_nidx]

                if prev_old_schema[2] != prev_new_schema[2]:                
                    self.column_position.append((self.col_pos_id,old_schema_idx[1],self.state_id,x[0],prev_new_schema[1],old_schema_idx[1]))
                    
                    temp_new_col.append((x[0],self.col_pos_id,old_schema_idx[1],prev_new_schema[1]))
                    self.col_pos_id+=1                
                else:
                    temp_new_col.append(old_col_schema[idx])
            else:
                temp_new_col.append(self.curr_col_schema[n_idx])
        
        logging.debug(("add column",temp_new_col))
        
        self.curr_col_schema = temp_new_col
                        
        #len(now_col)>len(prev_col)
        pass
    
    def remove_column(self,prev_df,df):
        #old_col = self.curr_col
        old_col = prev_df.columns
        curr_col = self.curr_col_schema
        now_col = df.columns
        
        removed = set(old_col) - set(now_col)
        logging.debug(("remove_column: ",removed))
        for x in removed:
            idx = list(old_col).index(x)

            #logging.debug(idx)
            prev_idx = None
            next_idx = None
            if idx > 0:
                prev_idx = self.curr_col_schema[idx-1]
            if idx < len(old_col)-1:
                next_idx = self.curr_col_schema[idx+1]
            
            self.column_position.append((self.col_pos_id,self.curr_col_schema[idx][1],self.state_id,x,-2,self.curr_col_schema[idx][1]))
            self.curr_col_pos[idx] = (self.col_pos_id,-2)
            old_col = self.curr_col_schema.pop(idx)
            
            
            self.col_pos_id+=1          
            if next_idx is not None:
                self.column_position.append((self.col_pos_id,next_idx[1],self.state_id,x,prev_idx[1],next_idx[0]))                
                self.col_pos_id+=1                                        
        pass
    
    def add_row(self,df):
        old_row = list(self.curr_row)
        now_row = list(df.index)
        new_row = set(now_row)-set(old_row)           
        logging.debug((old_row,now_row,new_row))
        
        
        for x in new_row:
            new_row_val = df.loc[[x],:]
            logging.debug(("new row",new_row_val.values.tolist(),self.curr_col_schema))
        
            for y,i in zip(new_row_val.values.tolist()[0],self.curr_col_schema):
                #print(i)
                self.cell.append((self.cell_id,i[1],self.row_id))
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0],-1))
                self.cell_id+=1
                self.value_id+=1
            
            self.row.append((self.row_id,self.array_id))            
            
            if self.curr_row_pos[self.row_id-1][1]!=-2:
                self.curr_row_pos[self.row_id] = (self.row_pos_id,self.row_id-1)
                self.row_position.append((self.row_pos_id,self.row_id,self.state_id,self.row_id-1,-1))
            else:
                temp_prev_row = list(filter(lambda x:(x[1]==self.row_id-1)&(x[3]!=-2),self.row_position))
                if len(temp_prev_row)>0:
                    self.curr_row_pos[self.row_id] = (self.row_pos_id,temp_prev_row[-1][3])
                    self.row_position.append((self.row_pos_id,self.row_id,temp_prev_row[-1][3],-1))

                else: 
                    self.curr_row_pos[self.row_id] = (self.row_pos_id,self.row_id-1)
            
            self.row_id+=1    
            self.row_pos_id+=1
                
            """
            idx = list(now_col).index(x)
                
            logging.debug(idx)
            if idx > 0:
                prev_idx = self.curr_col_schema[idx-1]
            
            self.column_position.append((self.col_pos_id,self.col_id,self.state_id,x,prev_idx[1],-1))
            
            self.curr_col_schema.insert(idx,(x,self.col_id))
            
            self.col_names_coll.add(x)
            
            
            # add values
            self.column.append((self.col_id,self.array_id))
            #self.cell_values.append()
            temp_idx = []
            for y,i in zip(new_col_val.values.tolist(),self.curr_row):
                self.cell.append((self.cell_id,self.col_id,i))
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0],-1))
                #print(self.pd_index.loc[i,jj])
                #self.pd_index.loc[i,jj] = (self.cell_id,self.value_id,i,jj) 
                temp_idx.append([(self.cell_id,self.value_id,i,self.col_id)])
                self.value_id+=1
                self.cell_id+=1
            
            self.pd_index[self.col_id] = pd.DataFrame(temp_idx)

            #prev_j = j
            self.col_pos_id+=1                    
            self.col_id+=1             
            """
        
        
        pass

    def remove_row(self,df):
        old_row = list(self.curr_row)
        now_row = list(df.index)
        removed_row = set(old_row)-set(now_row)           
        logging.debug((old_row,now_row,removed_row))
        
        
        for x in removed_row:
            temp_row_pos = self.curr_row_pos[x]

            self.row_position.append((self.row_pos_id,x,self.state_id,-2,temp_row_pos[0]))
            self.curr_row_pos[x] = (self.row_pos_id,-2)
            self.row_pos_id+=1
            # next pos

            # filter with previous row_pos_id
            next_row = list(filter(lambda x:x[1][1]==x,self.curr_row_pos.items()))
            if len(next_row)>0:
                next_row = next_row[0]
                self.row_position.append((self.row_pos_id,next_row[0],self.state_id,temp_row_pos[1],next_row[1][0]))
                self.curr_row_pos[next_row[0]] = (self.row_pos_id,temp_row_pos[1])

                self.row_pos_id+=1        
        pass
    
    def change_values(self,df,change):
        #self.curr_index = self.pd_index[change]
        
        #change.columns = df.columns
        #change.index = df.index                
        
        #l_idx = self.pd_index[change]
                
        tt = np.where(np.matrix(change.to_numpy())==True)
        #print(tt)
        tt = list(zip(tt[0],tt[1]))
        #logging.debug((list(zip(tt[0],tt[1]))))
        #if tt[0].shape[0] == 1:
        #    tt = [(tt[0][0],tt[1][0])]
        #logging.debug((np.where(np.matrix(change.to_numpy())==True)))
        #logging.debug(("tt",tt))
        logging.debug(("change_values pd_index:",self.pd_index,change.to_numpy()))
        idx = self.pd_index.to_numpy()[change.to_numpy()].flatten()        
        val = df.fillna("").to_numpy()[change.to_numpy()].flatten()
        logging.debug(("idx:",idx,val))
        #list(idx)
        idx_list = list(filter(lambda x:pd.isna(x)!=True,idx))
        val_list = list(filter(lambda x:pd.isna(x)!=True,val))
        
        logging.debug(("change_values idxlist:",idx_list,val_list))
        
        #?filter
        
        #print(idx_list,val_list)
        
        set_temp_col = set()

        for x in zip(idx_list,val_list,tt):
            logging.debug(("change_values idxlist:",x))
            self.cell_values.append((self.value_id,x[0][0],self.state_id,x[1],x[0][1]))   
            #logging.debug(("x2",x[2]))
            ttx = list(self.pd_index.loc[x[2][0],x[2][1]])
            ttx[1] = self.value_id
            #logging.debug((ttx))
            self.pd_index.loc[x[2][0],x[2][1]] = tuple(ttx)
            #x[0] = self.value_id            
            
            
            self.value_derived_from.append((self.value_id,self.state_id,x[0][1]))
                        
            temp_col = list(filter(lambda y: y[0]==x[0][0],self.cell))[0][1]   
            if temp_col not in set_temp_col:
                self.col_dependency.append((self.state_id,temp_col,temp_col))  
                set_temp_col.add(temp_col)
                #aaa
            
            """            
            logging.debug(("in_col",in_col))
            for z in in_col:
                zzl = [x[-1] for x in list(filter(lambda x:(x[0]==z),self.curr_col_schema))]
                logging.debug(("zzl",zzl))
                for zz in zzl:                        
                    op_cid = list(filter(lambda x:(x[1]==zz)&(x[2]==i),self.cell))
                    logging.debug(("op_cid",op_cid))
                    for zzz in op_cid:
                        op_val = list(filter(lambda x:x[1]==zzz[0],self.cell_values))
                        logging.debug(("op_val",op_val))
                        for v in op_val:
                            self.value_derived_from.append((self.value_id,v[0]))
            """
            
            self.value_id+=1
            #print(x)

        #prev_state_id = state_id
        #state_id+=1
        #state.append((state_id,prev_state_id))
        
        pass
                    
    
    def change_df(self,prev_df,now_df,state_ss,user=None,prev_ss=None):  
        try:
            in_col,out_col = dep_input_output_col(prev_df,now_df,state_ss["operation"]["description"])
        
            # add linkage to the derived by cell
            # lookup value id for input col_value
            if len(out_col)==0:
                logging.debug(("in_col:",in_col,self.curr_col_schema))
                for z in in_col:
                    zzl = [x[-2] if len(x)==4 else x[-1] for x in list(filter(lambda x:(x[0]==z),self.curr_col_schema))]
                    logging.debug(("zzl",zzl))
                    for zzy in zzl:
                        self.col_dependency.append((self.state_id,zzy,zzy))                                
        except:
            pass
        
        #prev_col = np.array(self.curr_col)
        prev_col = prev_df.columns
        now_col = now_df.columns
        
        #print(prev_col,now_col)
        
        
        # condition for add and remove columns
        if len(now_col)>len(prev_col):
            # condition for add_columns:
            logging.debug("add column")
            self.add_column(prev_df,now_df,state_ss)            
        elif len(prev_col)>len(now_col):
            # condition for remove_columns:
            logging.debug("remove_column")
            self.remove_column(prev_df,now_df)
            
        # condition for change of schema:
        if len(prev_col)==len(now_col) and np.sum(prev_col!=now_col)>0:
            logging.debug("change column schema")
            self.change_column_schema(prev_df,now_df)
        
        #prev_row = np.array(self.curr_row)
        prev_row = np.array(list(prev_df.index))
        now_row = np.array(list(now_df.index))
        
        # condition for add and remove rows        
        if len(now_row)>len(prev_row):
            # condition for add_columns:
            logging.debug("add row")
            self.add_row(now_df)            
        elif len(prev_row)>len(now_row):
            # condition for remove_columns:
            logging.debug("remove row")
            self.remove_row(now_df)
            
        # condition for change of schema:
        if len(prev_row)==len(now_row) and np.sum(prev_row!=now_row)>1:
            logging.debug("change row position")
            self.change_row_position(prev_df,now_df)
            
        
        # condition for change of values
        try:
            change_val = now_df.fillna("")!=prev_df.fillna("")
            if change_val.to_numpy().sum()>0:
                logging.debug("change values")
                self.change_values(now_df,change_val)
            else:
                logging.debug("nothing change")
        except BaseException as ex:
            logging.debug(ex)
            pass
        
                
        #prev_state_id = self.state_id
        #prev_state_id = self.state_id
        #self.state_id+=1   
        try:
            self.user[user]
        except:
            self.user[user]=self.user_id
            self.user_id+=1
        
        if prev_ss == None:
            self.state_detail.append((self.state_id,state_ss,self.execution_id,self.prev_state_id,self.user[user]))       
            self.state.append((self.state_id,self.prev_state_id))    
            self.prev_state_id = self.state_id
            self.state_id+=1   
        else:             
            self.state_detail.append((self.state_id,state_ss,self.execution_id,prev_ss,self.user[user]))       
            self.state.append((self.state_id,prev_ss))       
            self.prev_state_id = self.state_id
            self.state_id+=1
            
        self.execution_id+=1
        
        return True
        
        """
        
        col_id = np.where(prev_df.columns==col.name)[0][0]
        columns.append((col_id,array_id))
        #temp_col = []
        for i,x in enumerate(col):
            if not row_processed:
                rows.append((i,array_id))
            #temp_col.append((cell_id,col_id,i))
            cells.append((cell_id,col_id,i))
            cell_values.append((value_id,cell_id,state_id,x,-1))
            cell_id+=1
            value_id+=1
            #print(i,col_id)
            pd_index.loc[i,col_id] = (cell_id,col_id,i)            
        row_processed = True
        state_id+=1
        """
        
    def transform(self):
        prev_df = None
        now_df = None
        for i,x in enumerate(self.trace):
            now_df = x[5]
            #print(now_df)
            if i == 0:
                #init dataset
                self.init_dataset(x)
            else:
                self.change_df(prev_df,now_df)
                pass
            
            
            self.curr_col = now_df.columns
            #self.curr_col_schema = [for x in self.curr_col]
            self.curr_row = list(now_df.index)        
            self.curr_df = now_df                        
            prev_df = now_df
        
        

# Load Data

In [4]:
original_pd = pd.read_csv("../case_1/chicago_vert_dataset.csv",dtype=str).append(pd.read_csv("../case_2/chicago_vert_dataset.csv",dtype=str))
original_pd = original_pd.fillna("").reset_index(drop=True)
for x in list(filter(lambda x:x.endswith("flag"),original_pd.columns)):
    original_pd.loc[:,x] = original_pd.loc[:,x].astype(int) 
original_pd_before = original_pd.copy()

  original_pd = pd.read_csv("../case_1/chicago_vert_dataset.csv",dtype=str).append(pd.read_csv("../case_2/chicago_vert_dataset.csv",dtype=str))
  original_pd.loc[:,x] = original_pd.loc[:,x].astype(int)


In [5]:
super_ori_pd = pd.read_csv("../../../collaboration_simulation/airbnb_test_case/chicago_listings.csv",dtype=str)
super_ori_pd = super_ori_pd.fillna("")
for x in list(filter(lambda x:x.endswith("flag"),original_pd.columns)):
    super_ori_pd.loc[:,x] = 0
#super_ori_pd = super_ori_pd.loc[super_ori_pd.id.isin(original_pd.id),original_pd.columns]
#super_ori_pd
super_ori_pd = original_pd.merge(super_ori_pd,left_on="id",right_on="id",suffixes=("_x",""))
super_ori_pd = super_ori_pd.loc[:,original_pd.columns].reset_index(drop=True)

In [6]:
#dcm1.pd_index.iloc[134,6]

In [7]:
change_val = original_pd.fillna("null")!=super_ori_pd.fillna("null")
original_pd[change_val]

#idx = dcm1.pd_index.to_numpy()[change_val.to_numpy()].flatten()        
#val = original_pd.fillna("").to_numpy()[change_val.to_numpy()].flatten()
#idx_list = list(filter(lambda x:pd.isna(x)!=True,idx))
#val_list = list(filter(lambda x:pd.isna(x)!=True,val))

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,Entire home,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,,,,,,,,,,,...,,,,,,,,,,
3197,,,,,,,,,Apartment,,...,,,,,,,,,,
3198,,,,,,,,,,,...,,,,,,,,,,
3199,,,,,,,,,Apartment,,...,,,,,,,,,,


In [8]:
#len(idx_list),len(val_list)

In [10]:
#list(filter(lambda x:x[0]==3624,idx_list))

In [11]:
#idx_list.index((3624, 86509, 134, 6))

In [12]:
#val_list[82]

In [13]:
#val_list,idx_list

In [14]:
#super_ori_pd.iloc[134,6]

In [15]:
#original_pd.iloc[134,6]

In [16]:
super_ori_pd.describe()

Unnamed: 0,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
count,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
original_pd

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,782628636832878491,"Steps to Shop, Eat, Train | Easy Access | Zencity",47172572,Zencity,,West Town,41.8955,-87.66124,Entire home/apt,50,...,R19000043484,0,0,0,0,0,0,0,0,0
3197,782643895516370805,Old Town Oasis,169297663,William,,Near North Side,41.90105,-87.63716,Apartment,120,...,R22000093645,0,0,0,0,0,0,0,0,0
3198,784994899201350568,Lovely 1 bed Apt in River North,52827024,Yakir,,Near West Side,41.88822006416301,-87.64145321718578,Entire home/apt,84,...,,0,0,0,0,0,0,0,0,0
3199,785423932330914663,"River North 1br w/ gym, pool & roof, nr Riverwalk",107434423,Blueground,,Near North Side,41.890516,-87.635955,Apartment,169,...,,0,0,0,0,0,0,0,0,0


In [18]:
original_pd.shape,super_ori_pd.shape

((3201, 27), (3201, 27))

In [19]:
import numpy as np
import json
import pandas as pd

dcm1 = TransformDCM(None)

In [20]:
dcm1.init_dataset_df(super_ori_pd)

DEBUG:root:init dataset


In [21]:
dcm1.curr_col = super_ori_pd.columns
dcm1.curr_row = list(super_ori_pd.index) 
dcm1.curr_df = super_ori_pd

In [22]:
original_pd[super_ori_pd.columns]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,782628636832878491,"Steps to Shop, Eat, Train | Easy Access | Zencity",47172572,Zencity,,West Town,41.8955,-87.66124,Entire home/apt,50,...,R19000043484,0,0,0,0,0,0,0,0,0
3197,782643895516370805,Old Town Oasis,169297663,William,,Near North Side,41.90105,-87.63716,Apartment,120,...,R22000093645,0,0,0,0,0,0,0,0,0
3198,784994899201350568,Lovely 1 bed Apt in River North,52827024,Yakir,,Near West Side,41.88822006416301,-87.64145321718578,Entire home/apt,84,...,,0,0,0,0,0,0,0,0,0
3199,785423932330914663,"River North 1br w/ gym, pool & roof, nr Riverwalk",107434423,Blueground,,Near North Side,41.890516,-87.635955,Apartment,169,...,,0,0,0,0,0,0,0,0,0


In [23]:
dcm1.change_df(super_ori_pd,original_pd[super_ori_pd.columns],"perturbed_dataset","s1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', ((66, 66, 2, 12), 'November 28, 2022', (2, 12)))
DEBUG:root:('change_values idxlist:', ((89, 89, 3, 8), 'Entire home', (3, 8)))
DEBUG:root:('change_values idxlist:', ((167, 167, 6, 5), 'Lhke eieu', (6, 5)))
DEBUG:root:('change_values idxlist:', ((170, 170, 6, 8), 'Entire home', (6, 8)))
DEBUG:root:('change_values idxlist:', ((224, 224, 8, 8), 'Entire home', (8, 8)))
DEBUG:root:('change_values idxlist:', ((227, 227, 8, 11), '7i4', (8, 11)))
DEBUG:root:('change_values idxlist:', ((248, 248, 9, 5), 'LgScoLn sark', (9, 5)))
DEBUG:root:('change_values idxlist:', ((283, 283, 10, 13), '0.6', (10, 13)))
DEBUG:root:('change_values idxlist:', ((329, 329, 12, 5), 'LFnxolfWPark', (12, 5)))
DEBUG:root:('change_values idxlist:', ((356, 356, 13, 5), 'Linikln dlrk', (13, 5)))
DEBUG:root:('change_values idxlist:', ((359, 359, 13, 8), 'Entire home', (13, 8)))
DEBUG:root:('change_values idxlist:', ((383, 383, 14, 5), 'poban Sfuaro', (14, 5)))
DEBUG:root:('change_valu

DEBUG:root:('change_values idxlist:', ((4383, 4383, 162, 9), '', (162, 9)))
DEBUG:root:('change_values idxlist:', ((4385, 4385, 162, 11), '', (162, 11)))
DEBUG:root:('change_values idxlist:', ((4409, 4409, 163, 8), 'Entire home', (163, 8)))
DEBUG:root:('change_values idxlist:', ((4414, 4414, 163, 13), '2.4', (163, 13)))
DEBUG:root:('change_values idxlist:', ((4485, 4485, 166, 3), 'FKari', (166, 3)))
DEBUG:root:('change_values idxlist:', ((4514, 4514, 167, 5), 'ciKcolnXPPmY', (167, 5)))
DEBUG:root:('change_values idxlist:', ((4541, 4541, 168, 5), 'ciKcolnXPPmY', (168, 5)))
DEBUG:root:('change_values idxlist:', ((4649, 4649, 172, 5), 'Ugyown', (172, 5)))
DEBUG:root:('change_values idxlist:', ((4676, 4676, 173, 5), 'xpRown', (173, 5)))
DEBUG:root:('change_values idxlist:', ((4682, 4682, 173, 11), 'A41', (173, 11)))
DEBUG:root:('change_values idxlist:', ((4699, 4699, 174, 1), '', (174, 1)))
DEBUG:root:('change_values idxlist:', ((4707, 4707, 174, 9), '', (174, 9)))
DEBUG:root:('change_valu

DEBUG:root:('change_values idxlist:', ((7433, 7433, 275, 8), 'Entire home', (275, 8)))
DEBUG:root:('change_values idxlist:', ((7460, 7460, 276, 8), 'Entire home', (276, 8)))
DEBUG:root:('change_values idxlist:', ((7509, 7509, 278, 3), 'Jo suhua', (278, 3)))
DEBUG:root:('change_values idxlist:', ((7538, 7538, 279, 5), 'IJke Vieg', (279, 5)))
DEBUG:root:('change_values idxlist:', ((7573, 7573, 280, 13), '0.4', (280, 13)))
DEBUG:root:('change_values idxlist:', ((7754, 7754, 287, 5), 'NonthIPCrk', (287, 5)))
DEBUG:root:('change_values idxlist:', ((7841, 7841, 290, 11), 'K432', (290, 11)))
DEBUG:root:('change_values idxlist:', ((7842, 7842, 290, 12), 'December 18, 2022', (290, 12)))
DEBUG:root:('change_values idxlist:', ((7863, 7863, 291, 6), 'QJ41.95089', (291, 6)))
DEBUG:root:('change_values idxlist:', ((7890, 7890, 292, 6), '41W.o94713', (292, 6)))
DEBUG:root:('change_values idxlist:', ((7896, 7896, 292, 12), 'November 27, 2022', (292, 12)))
DEBUG:root:('change_values idxlist:', ((7916, 

DEBUG:root:('change_values idxlist:', ((11426, 11426, 423, 5), 'LincolnKSqusqe', (423, 5)))
DEBUG:root:('change_values idxlist:', ((11455, 11455, 424, 7), 'w-87.e64M159', (424, 7)))
DEBUG:root:('change_values idxlist:', ((11456, 11456, 424, 8), 'Entire home', (424, 8)))
DEBUG:root:('change_values idxlist:', ((11480, 11480, 425, 5), 'GrBnd BouCevarU', (425, 5)))
DEBUG:root:('change_values idxlist:', ((11562, 11562, 428, 6), '41.U768F45', (428, 6)))
DEBUG:root:('change_values idxlist:', ((11613, 11613, 430, 3), 'NatMe', (430, 3)))
DEBUG:root:('change_values idxlist:', ((11616, 11616, 430, 6), '41u.ut783544', (430, 6)))
DEBUG:root:('change_values idxlist:', ((11623, 11623, 430, 13), '0.3', (430, 13)))
DEBUG:root:('change_values idxlist:', ((11645, 11645, 431, 8), 'Entire home', (431, 8)))
DEBUG:root:('change_values idxlist:', ((11648, 11648, 431, 11), 'u13', (431, 11)))
DEBUG:root:('change_values idxlist:', ((11672, 11672, 432, 8), 'Entire home', (432, 8)))
DEBUG:root:('change_values idxl

DEBUG:root:('change_values idxlist:', ((14642, 14642, 542, 8), 'Entire home', (542, 8)))
DEBUG:root:('change_values idxlist:', ((14646, 14646, 542, 12), 'November 25, 2022', (542, 12)))
DEBUG:root:('change_values idxlist:', ((14726, 14726, 545, 11), 'x27', (545, 11)))
DEBUG:root:('change_values idxlist:', ((14747, 14747, 546, 5), 'LakepVkew', (546, 5)))
DEBUG:root:('change_values idxlist:', ((14750, 14750, 546, 8), 'Entire home', (546, 8)))
DEBUG:root:('change_values idxlist:', ((14774, 14774, 547, 5), 'LakepVkew', (547, 5)))
DEBUG:root:('change_values idxlist:', ((14777, 14777, 547, 8), 'Entire home', (547, 8)))
DEBUG:root:('change_values idxlist:', ((14829, 14829, 549, 6), '42.019PW36', (549, 6)))
DEBUG:root:('change_values idxlist:', ((14963, 14963, 554, 5), 'foop', (554, 5)))
DEBUG:root:('change_values idxlist:', ((15019, 15019, 556, 7), '-8F7.7eW0664', (556, 7)))
DEBUG:root:('change_values idxlist:', ((15020, 15020, 556, 8), 'Entire home', (556, 8)))
DEBUG:root:('change_values idx

DEBUG:root:('change_values idxlist:', ((18720, 18720, 693, 9), '', (693, 9)))
DEBUG:root:('change_values idxlist:', ((18723, 18723, 693, 12), '', (693, 12)))
DEBUG:root:('change_values idxlist:', ((18777, 18777, 695, 12), 'August 01, 2022', (695, 12)))
DEBUG:root:('change_values idxlist:', ((18824, 18824, 697, 5), 'TrmaeeA GraxO Crossing', (697, 5)))
DEBUG:root:('change_values idxlist:', ((18849, 18849, 698, 3), 'KerilRLyn', (698, 3)))
DEBUG:root:('change_values idxlist:', ((18854, 18854, 698, 8), 'Entire home', (698, 8)))
DEBUG:root:('change_values idxlist:', ((18962, 18962, 702, 8), 'Entire home', (702, 8)))
DEBUG:root:('change_values idxlist:', ((19040, 19040, 705, 5), 'GraxduBcuEevard', (705, 5)))
DEBUG:root:('change_values idxlist:', ((19119, 19119, 708, 3), 'KYKristin', (708, 3)))
DEBUG:root:('change_values idxlist:', ((19204, 19204, 711, 7), 'Q-8C7.6451M6', (711, 7)))
DEBUG:root:('change_values idxlist:', ((19256, 19256, 713, 5), 'WoAdlaKn', (713, 5)))
DEBUG:root:('change_values

DEBUG:root:('change_values idxlist:', ((22661, 22661, 839, 8), 'Entire home', (839, 8)))
DEBUG:root:('change_values idxlist:', ((22688, 22688, 840, 8), 'Entire home', (840, 8)))
DEBUG:root:('change_values idxlist:', ((22712, 22712, 841, 5), 'JefYersnnQPkrk', (841, 5)))
DEBUG:root:('change_values idxlist:', ((22713, 22713, 841, 6), 'x4P1.98126', (841, 6)))
DEBUG:root:('change_values idxlist:', ((22742, 22742, 842, 8), 'Entire home', (842, 8)))
DEBUG:root:('change_values idxlist:', ((22747, 22747, 842, 13), '0.7', (842, 13)))
DEBUG:root:('change_values idxlist:', ((22847, 22847, 846, 5), 'HydeDPKPk', (846, 5)))
DEBUG:root:('change_values idxlist:', ((22874, 22874, 847, 5), 'HydeDPKPk', (847, 5)))
DEBUG:root:('change_values idxlist:', ((22908, 22908, 848, 12), 'December 11, 2022', (848, 12)))
DEBUG:root:('change_values idxlist:', ((22909, 22909, 848, 13), '2.7', (848, 13)))
DEBUG:root:('change_values idxlist:', ((22936, 22936, 849, 13), '1.4', (849, 13)))
DEBUG:root:('change_values idxlis

DEBUG:root:('change_values idxlist:', ((26176, 26176, 969, 13), '1.5', (969, 13)))
DEBUG:root:('change_values idxlist:', ((26230, 26230, 971, 13), '4.5', (971, 13)))
DEBUG:root:('change_values idxlist:', ((26249, 26249, 972, 5), 'cirwood ParG', (972, 5)))
DEBUG:root:('change_values idxlist:', ((26252, 26252, 972, 8), 'Entire home', (972, 8)))
DEBUG:root:('change_values idxlist:', ((26284, 26284, 973, 13), '4.4', (973, 13)))
DEBUG:root:('change_values idxlist:', ((26306, 26306, 974, 8), 'Entire home', (974, 8)))
DEBUG:root:('change_values idxlist:', ((26330, 26330, 975, 5), 'hoodlawN', (975, 5)))
DEBUG:root:('change_values idxlist:', ((26331, 26331, 975, 6), '41Aj.77544', (975, 6)))
DEBUG:root:('change_values idxlist:', ((26360, 26360, 976, 8), 'Entire home', (976, 8)))
DEBUG:root:('change_values idxlist:', ((26546, 26546, 983, 5), 'LincocnUPaDJ', (983, 5)))
DEBUG:root:('change_values idxlist:', ((26576, 26576, 984, 8), 'Entire home', (984, 8)))
DEBUG:root:('change_values idxlist:', ((2

DEBUG:root:('change_values idxlist:', ((29651, 29651, 1098, 5), 'Mggan Squaks', (1098, 5)))
DEBUG:root:('change_values idxlist:', ((29707, 29707, 1100, 7), 'dR-87.63Y386', (1100, 7)))
DEBUG:root:('change_values idxlist:', ((29708, 29708, 1100, 8), 'Entire home', (1100, 8)))
DEBUG:root:('change_values idxlist:', ((29757, 29757, 1102, 3), 'PveTter', (1102, 3)))
DEBUG:root:('change_values idxlist:', ((29789, 29789, 1103, 8), 'Entire home', (1103, 8)))
DEBUG:root:('change_values idxlist:', ((29814, 29814, 1104, 6), '41G.939V11', (1104, 6)))
DEBUG:root:('change_values idxlist:', ((29840, 29840, 1105, 5), 'ChFthKm', (1105, 5)))
DEBUG:root:('change_values idxlist:', ((29848, 29848, 1105, 13), '3.2', (1105, 13)))
DEBUG:root:('change_values idxlist:', ((29921, 29921, 1108, 5), 'Moop', (1108, 5)))
DEBUG:root:('change_values idxlist:', ((29924, 29924, 1108, 8), 'Entire home', (1108, 8)))
DEBUG:root:('change_values idxlist:', ((29951, 29951, 1109, 8), 'Entire home', (1109, 8)))
DEBUG:root:('change

DEBUG:root:('change_values idxlist:', ((32567, 32567, 1206, 5), 'SowTdOWeqt Sldc', (1206, 5)))
DEBUG:root:('change_values idxlist:', ((32568, 32568, 1206, 6), '41.85lW792', (1206, 6)))
DEBUG:root:('change_values idxlist:', ((32594, 32594, 1207, 5), 'SowTdOWeqt Sldc', (1207, 5)))
DEBUG:root:('change_values idxlist:', ((32595, 32595, 1207, 6), '41.85lW792', (1207, 6)))
DEBUG:root:('change_values idxlist:', ((32656, 32656, 1209, 13), '1.3', (1209, 13)))
DEBUG:root:('change_values idxlist:', ((32673, 32673, 1210, 3), ' BlUuegroSund', (1210, 3)))
DEBUG:root:('change_values idxlist:', ((32678, 32678, 1210, 8), 'Entire home', (1210, 8)))
DEBUG:root:('change_values idxlist:', ((32704, 32704, 1211, 7), '-87.6gSs4454', (1211, 7)))
DEBUG:root:('change_values idxlist:', ((32705, 32705, 1211, 8), 'Entire home', (1211, 8)))
DEBUG:root:('change_values idxlist:', ((32737, 32737, 1212, 13), '0.3', (1212, 13)))
DEBUG:root:('change_values idxlist:', ((32783, 32783, 1214, 5), 'Doop', (1214, 5)))
DEBUG:roo

DEBUG:root:('change_values idxlist:', ((35510, 35510, 1315, 5), 'LmnciJnFPark', (1315, 5)))
DEBUG:root:('change_values idxlist:', ((35544, 35544, 1316, 12), 'December 16, 2022', (1316, 12)))
DEBUG:root:('change_values idxlist:', ((35619, 35619, 1319, 6), '41.m96l028', (1319, 6)))
DEBUG:root:('change_values idxlist:', ((35699, 35699, 1322, 5), 'JxffersonUuark', (1322, 5)))
DEBUG:root:('change_values idxlist:', ((35753, 35753, 1324, 5), 'GreaDer GraNa CrosPiMg', (1324, 5)))
DEBUG:root:('change_values idxlist:', ((35781, 35781, 1325, 6), 'CTo41.7270344', (1325, 6)))
DEBUG:root:('change_values idxlist:', ((35783, 35783, 1325, 8), 'Entire home', (1325, 8)))
DEBUG:root:('change_values idxlist:', ((35832, 35832, 1327, 3), 'RIYshin', (1327, 3)))
DEBUG:root:('change_values idxlist:', ((35888, 35888, 1329, 5), 'SoutH LuSndvle', (1329, 5)))
DEBUG:root:('change_values idxlist:', ((35891, 35891, 1329, 8), 'Entire home', (1329, 8)))
DEBUG:root:('change_values idxlist:', ((35945, 35945, 1331, 8), 'En

DEBUG:root:('change_values idxlist:', ((39207, 39207, 1452, 3), 'LMane', (1452, 3)))
DEBUG:root:('change_values idxlist:', ((39209, 39209, 1452, 5), 'RoSers TaDk', (1452, 5)))
DEBUG:root:('change_values idxlist:', ((39297, 39297, 1455, 12), 'December 01, 2022', (1455, 12)))
DEBUG:root:('change_values idxlist:', ((39298, 39298, 1455, 13), '1.0', (1455, 13)))
DEBUG:root:('change_values idxlist:', ((39452, 39452, 1461, 5), 'Sooclawn', (1461, 5)))
DEBUG:root:('change_values idxlist:', ((39455, 39455, 1461, 8), 'Entire home', (1461, 8)))
DEBUG:root:('change_values idxlist:', ((39487, 39487, 1462, 13), '1.0', (1462, 13)))
DEBUG:root:('change_values idxlist:', ((39560, 39560, 1465, 5), 'RPgersbPaCk', (1465, 5)))
DEBUG:root:('change_values idxlist:', ((39568, 39568, 1465, 13), '1.0', (1465, 13)))
DEBUG:root:('change_values idxlist:', ((39587, 39587, 1466, 5), 'Logan Mquarq', (1466, 5)))
DEBUG:root:('change_values idxlist:', ((39589, 39589, 1466, 7), 'h-87S.71n622', (1466, 7)))
DEBUG:root:('cha

DEBUG:root:('change_values idxlist:', ((42908, 42908, 1589, 5), 'NeaJr NocrthS Side', (1589, 5)))
DEBUG:root:('change_values idxlist:', ((42962, 42962, 1591, 5), 'Nejaro Northh Side', (1591, 5)))
DEBUG:root:('change_values idxlist:', ((42965, 42965, 1591, 8), 'Apartment', (1591, 8)))
DEBUG:root:('change_values idxlist:', ((42989, 42989, 1592, 5), 'Near WeUst Siwdue', (1592, 5)))
DEBUG:root:('change_values idxlist:', ((43022, 43022, 1593, 11), 'U198', (1593, 11)))
DEBUG:root:('change_values idxlist:', ((43041, 43041, 1594, 3), 'Disne', (1594, 3)))
DEBUG:root:('change_values idxlist:', ((43043, 43043, 1594, 5), 'NearO Wgegst Side', (1594, 5)))
DEBUG:root:('change_values idxlist:', ((43097, 43097, 1596, 5), 'DuEnnAaing', (1596, 5)))
DEBUG:root:('change_values idxlist:', ((43205, 43205, 1600, 5), 'NEast GarFfiIeld Park', (1600, 5)))
DEBUG:root:('change_values idxlist:', ((43211, 43211, 1600, 11), '1F25', (1600, 11)))
DEBUG:root:('change_values idxlist:', ((43240, 43240, 1601, 13), '0.5', (

DEBUG:root:('change_values idxlist:', ((46200, 46200, 1711, 3), 'Kashao& Owen', (1711, 3)))
DEBUG:root:('change_values idxlist:', ((46205, 46205, 1711, 8), 'Apartment', (1711, 8)))
DEBUG:root:('change_values idxlist:', ((46227, 46227, 1712, 3), 'Kashao& Owen', (1712, 3)))
DEBUG:root:('change_values idxlist:', ((46229, 46229, 1712, 5), '', (1712, 5)))
DEBUG:root:('change_values idxlist:', ((46232, 46232, 1712, 8), 'Apartment', (1712, 8)))
DEBUG:root:('change_values idxlist:', ((46235, 46235, 1712, 11), '', (1712, 11)))
DEBUG:root:('change_values idxlist:', ((46236, 46236, 1712, 12), '', (1712, 12)))
DEBUG:root:('change_values idxlist:', ((46281, 46281, 1714, 3), 'FreehGnl', (1714, 3)))
DEBUG:root:('change_values idxlist:', ((46337, 46337, 1716, 5), 'Near Notrethk Side', (1716, 5)))
DEBUG:root:('change_values idxlist:', ((46399, 46399, 1718, 13), '0.7', (1718, 13)))
DEBUG:root:('change_values idxlist:', ((46418, 46418, 1719, 5), 'Wbest ToUwKn', (1719, 5)))
DEBUG:root:('change_values idxl

DEBUG:root:('change_values idxlist:', ((50066, 50066, 1854, 8), 'Apartment', (1854, 8)))
DEBUG:root:('change_values idxlist:', ((50117, 50117, 1856, 5), 'kIrving PSarek', (1856, 5)))
DEBUG:root:('change_values idxlist:', ((50144, 50144, 1857, 5), 'West TPWeown', (1857, 5)))
DEBUG:root:('change_values idxlist:', ((50250, 50250, 1861, 3), 'Cajs', (1861, 3)))
DEBUG:root:('change_values idxlist:', ((50252, 50252, 1861, 5), 'AuburrmMn Gresham', (1861, 5)))
DEBUG:root:('change_values idxlist:', ((50279, 50279, 1862, 5), 'Near gWesMt FSide', (1862, 5)))
DEBUG:root:('change_values idxlist:', ((50281, 50281, 1862, 7), '-8764761', (1862, 7)))
DEBUG:root:('change_values idxlist:', ((50388, 50388, 1866, 6), '4190597', (1866, 6)))
DEBUG:root:('change_values idxlist:', ((50496, 50496, 1870, 6), '4190606', (1870, 6)))
DEBUG:root:('change_values idxlist:', ((50523, 50523, 1871, 6), '4190606', (1871, 6)))
DEBUG:root:('change_values idxlist:', ((50554, 50554, 1872, 10), 'j32', (1872, 10)))
DEBUG:root:('

DEBUG:root:('change_values idxlist:', ((53818, 53818, 1993, 7), '-87616173', (1993, 7)))
DEBUG:root:('change_values idxlist:', ((53845, 53845, 1994, 7), '-87616173', (1994, 7)))
DEBUG:root:('change_values idxlist:', ((53876, 53876, 1995, 11), 't61', (1995, 11)))
DEBUG:root:('change_values idxlist:', ((53924, 53924, 1997, 5), 'Edgxewbfater', (1997, 5)))
DEBUG:root:('change_values idxlist:', ((53951, 53951, 1998, 5), 'Edgxewbfater', (1998, 5)))
DEBUG:root:('change_values idxlist:', ((54008, 54008, 2000, 8), 'Apartment', (2000, 8)))
DEBUG:root:('change_values idxlist:', ((54094, 54094, 2003, 13), '1.1', (2003, 13)))
DEBUG:root:('change_values idxlist:', ((54113, 54113, 2004, 5), 'xIrviGng POark', (2004, 5)))
DEBUG:root:('change_values idxlist:', ((54143, 54143, 2005, 8), 'Apartment', (2005, 8)))
DEBUG:root:('change_values idxlist:', ((54172, 54172, 2006, 10), '20p0', (2006, 10)))
DEBUG:root:('change_values idxlist:', ((54357, 54357, 2013, 6), '4186945', (2013, 6)))
DEBUG:root:('change_val

DEBUG:root:('change_values idxlist:', ((57277, 57277, 2121, 10), '3a2', (2121, 10)))
DEBUG:root:('change_values idxlist:', ((57299, 57299, 2122, 5), 'ANeaur NoCrth Side', (2122, 5)))
DEBUG:root:('change_values idxlist:', ((57326, 57326, 2123, 5), 'DEast GJarWfield Park', (2123, 5)))
DEBUG:root:('change_values idxlist:', ((57329, 57329, 2123, 8), 'Apartment', (2123, 8)))
DEBUG:root:('change_values idxlist:', ((57332, 57332, 2123, 11), 'd76', (2123, 11)))
DEBUG:root:('change_values idxlist:', ((57334, 57334, 2123, 13), '2.0', (2123, 13)))
DEBUG:root:('change_values idxlist:', ((57353, 57353, 2124, 5), 'NeaTr NoErth Sirde', (2124, 5)))
DEBUG:root:('change_values idxlist:', ((57356, 57356, 2124, 8), 'Apartment', (2124, 8)))
DEBUG:root:('change_values idxlist:', ((57387, 57387, 2125, 12), '12/05/22', (2125, 12)))
DEBUG:root:('change_values idxlist:', ((57410, 57410, 2126, 8), 'Apartment', (2126, 8)))
DEBUG:root:('change_values idxlist:', ((57436, 57436, 2127, 7), '-8769382', (2127, 7)))
DEB

DEBUG:root:('change_values idxlist:', ((60891, 60891, 2255, 6), '419347', (2255, 6)))
DEBUG:root:('change_values idxlist:', ((60893, 60893, 2255, 8), 'Apartment', (2255, 8)))
DEBUG:root:('change_values idxlist:', ((60917, 60917, 2256, 5), 'Wmerst HTown', (2256, 5)))
DEBUG:root:('change_values idxlist:', ((60944, 60944, 2257, 5), 'NrQear Noruth Side', (2257, 5)))
DEBUG:root:('change_values idxlist:', ((61114, 61114, 2263, 13), '3.1', (2263, 13)))
DEBUG:root:('change_values idxlist:', ((61214, 61214, 2267, 5), 'Near OvtWest Side', (2267, 5)))
DEBUG:root:('change_values idxlist:', ((61249, 61249, 2268, 13), '1.1', (2268, 13)))
DEBUG:root:('change_values idxlist:', ((61271, 61271, 2269, 8), 'Apartment', (2269, 8)))
DEBUG:root:('change_values idxlist:', ((61322, 61322, 2271, 5), 'NUearT Northc Side', (2271, 5)))
DEBUG:root:('change_values idxlist:', ((61324, 61324, 2271, 7), '-876258', (2271, 7)))
DEBUG:root:('change_values idxlist:', ((61329, 61329, 2271, 12), '02/10/22', (2271, 12)))
DEBU

DEBUG:root:('change_values idxlist:', ((64267, 64267, 2380, 7), '-87655685', (2380, 7)))
DEBUG:root:('change_values idxlist:', ((64272, 64272, 2380, 12), '12/12/22', (2380, 12)))
DEBUG:root:('change_values idxlist:', ((64322, 64322, 2382, 8), 'Apartment', (2382, 8)))
DEBUG:root:('change_values idxlist:', ((64378, 64378, 2384, 10), '3g2', (2384, 10)))
DEBUG:root:('change_values idxlist:', ((64381, 64381, 2384, 13), '0.2', (2384, 13)))
DEBUG:root:('change_values idxlist:', ((64402, 64402, 2385, 7), '-876938', (2385, 7)))
DEBUG:root:('change_values idxlist:', ((64452, 64452, 2387, 3), 'DaBeed', (2387, 3)))
DEBUG:root:('change_values idxlist:', ((64454, 64454, 2387, 5), 'WeKMst PLullman', (2387, 5)))
DEBUG:root:('change_values idxlist:', ((64535, 64535, 2390, 5), 'BrViBdigeport', (2390, 5)))
DEBUG:root:('change_values idxlist:', ((64564, 64564, 2391, 7), '-8761823', (2391, 7)))
DEBUG:root:('change_values idxlist:', ((64589, 64589, 2392, 5), 'West RxirdgIe', (2392, 5)))
DEBUG:root:('change_

DEBUG:root:('change_values idxlist:', ((67485, 67485, 2499, 12), '11/25/22', (2499, 12)))
DEBUG:root:('change_values idxlist:', ((67512, 67512, 2500, 12), '11/25/22', (2500, 12)))
DEBUG:root:('change_values idxlist:', ((67559, 67559, 2502, 5), 'SWGest bTown', (2502, 5)))
DEBUG:root:('change_values idxlist:', ((67586, 67586, 2503, 5), 'EaswDt Garfiexld Park', (2503, 5)))
DEBUG:root:('change_values idxlist:', ((67613, 67613, 2504, 5), 'RdoKwseland', (2504, 5)))
DEBUG:root:('change_values idxlist:', ((67619, 67619, 2504, 11), '2d8', (2504, 11)))
DEBUG:root:('change_values idxlist:', ((67641, 67641, 2505, 6), '418634989', (2505, 6)))
DEBUG:root:('change_values idxlist:', ((67667, 67667, 2506, 5), 'IrvingE CPartk', (2506, 5)))
DEBUG:root:('change_values idxlist:', ((67721, 67721, 2508, 5), 'QEodgewa ter', (2508, 5)))
DEBUG:root:('change_values idxlist:', ((67802, 67802, 2511, 5), 'NeSar Norrtwh Side', (2511, 5)))
DEBUG:root:('change_values idxlist:', ((67829, 67829, 2512, 5), 'Avokrndkale',

DEBUG:root:('change_values idxlist:', ((71072, 71072, 2632, 8), 'Apartment', (2632, 8)))
DEBUG:root:('change_values idxlist:', ((71096, 71096, 2633, 5), 'Near WesQBt jSide', (2633, 5)))
DEBUG:root:('change_values idxlist:', ((71158, 71158, 2635, 13), '0.7', (2635, 13)))
DEBUG:root:('change_values idxlist:', ((71184, 71184, 2636, 12), '11/28/22', (2636, 12)))
DEBUG:root:('change_values idxlist:', ((71204, 71204, 2637, 5), 'Nelhar Westt Side', (2637, 5)))
DEBUG:root:('change_values idxlist:', ((71234, 71234, 2638, 8), 'Apartment', (2638, 8)))
DEBUG:root:('change_values idxlist:', ((71237, 71237, 2638, 11), '1A2', (2638, 11)))
DEBUG:root:('change_values idxlist:', ((71292, 71292, 2640, 12), '11/13/22', (2640, 12)))
DEBUG:root:('change_values idxlist:', ((71373, 71373, 2643, 12), '10/31/22', (2643, 12)))
DEBUG:root:('change_values idxlist:', ((71374, 71374, 2643, 13), '3.5', (2643, 13)))
DEBUG:root:('change_values idxlist:', ((71393, 71393, 2644, 5), 'NeUar WesBt Sjide', (2644, 5)))
DEBUG:

DEBUG:root:('change_values idxlist:', ((74170, 74170, 2747, 1), '', (2747, 1)))
DEBUG:root:('change_values idxlist:', ((74174, 74174, 2747, 5), '', (2747, 5)))
DEBUG:root:('change_values idxlist:', ((74178, 74178, 2747, 9), '', (2747, 9)))
DEBUG:root:('change_values idxlist:', ((74181, 74181, 2747, 12), '12/05/22', (2747, 12)))
DEBUG:root:('change_values idxlist:', ((74208, 74208, 2748, 12), '12/05/22', (2748, 12)))
DEBUG:root:('change_values idxlist:', ((74282, 74282, 2751, 5), 'DUiunnking', (2751, 5)))
DEBUG:root:('change_values idxlist:', ((74285, 74285, 2751, 8), 'Apartment', (2751, 8)))
DEBUG:root:('change_values idxlist:', ((74290, 74290, 2751, 13), '1.0', (2751, 13)))
DEBUG:root:('change_values idxlist:', ((74498, 74498, 2759, 5), 'AIrmMour Squuare', (2759, 5)))
DEBUG:root:('change_values idxlist:', ((74499, 74499, 2759, 6), '4184523', (2759, 6)))
DEBUG:root:('change_values idxlist:', ((74506, 74506, 2759, 13), '3.3', (2759, 13)))
DEBUG:root:('change_values idxlist:', ((74523, 7

DEBUG:root:('change_values idxlist:', ((77576, 77576, 2873, 5), 'QEdgeawatAer', (2873, 5)))
DEBUG:root:('change_values idxlist:', ((77630, 77630, 2875, 5), 'Near SoyuHtGh Side', (2875, 5)))
DEBUG:root:('change_values idxlist:', ((77659, 77659, 2876, 7), '-8758562', (2876, 7)))
DEBUG:root:('change_values idxlist:', ((77768, 77768, 2880, 8), 'Apartment', (2880, 8)))
DEBUG:root:('change_values idxlist:', ((77817, 77817, 2882, 3), 'lj', (2882, 3)))
DEBUG:root:('change_values idxlist:', ((77819, 77819, 2882, 5), 'Near dNortih Sfide', (2882, 5)))
DEBUG:root:('change_values idxlist:', ((77881, 77881, 2884, 13), '1.7', (2884, 13)))
DEBUG:root:('change_values idxlist:', ((77898, 77898, 2885, 3), 'IBy', (2885, 3)))
DEBUG:root:('change_values idxlist:', ((77905, 77905, 2885, 10), '3q2', (2885, 10)))
DEBUG:root:('change_values idxlist:', ((77981, 77981, 2888, 5), 'EgasGt Garfield PaDrk', (2888, 5)))
DEBUG:root:('change_values idxlist:', ((78006, 78006, 2889, 3), 'Oafael', (2889, 3)))
DEBUG:root:('

DEBUG:root:('change_values idxlist:', ((81195, 81195, 3007, 6), '419081166', (3007, 6)))
DEBUG:root:('change_values idxlist:', ((81221, 81221, 3008, 5), 'uWesjt Towxn', (3008, 5)))
DEBUG:root:('change_values idxlist:', ((81275, 81275, 3010, 5), 'NhUDear West Side', (3010, 5)))
DEBUG:root:('change_values idxlist:', ((81383, 81383, 3014, 5), '', (3014, 5)))
DEBUG:root:('change_values idxlist:', ((81384, 81384, 3014, 6), '', (3014, 6)))
DEBUG:root:('change_values idxlist:', ((81472, 81472, 3017, 13), '1.0', (3017, 13)))
DEBUG:root:('change_values idxlist:', ((81489, 81489, 3018, 3), 'xoliEna', (3018, 3)))
DEBUG:root:('change_values idxlist:', ((81499, 81499, 3018, 13), '1.0', (3018, 13)))
DEBUG:root:('change_values idxlist:', ((81710, 81710, 3026, 8), 'Apartment', (3026, 8)))
DEBUG:root:('change_values idxlist:', ((81734, 81734, 3027, 5), 'NeHar XSoutwh Side', (3027, 5)))
DEBUG:root:('change_values idxlist:', ((81759, 81759, 3028, 3), 'AmdreP', (3028, 3)))
DEBUG:root:('change_values idxli

DEBUG:root:('change_values idxlist:', ((84594, 84594, 3133, 3), 'Nesise', (3133, 3)))
DEBUG:root:('change_values idxlist:', ((84604, 84604, 3133, 13), '2.0', (3133, 13)))
DEBUG:root:('change_values idxlist:', ((84623, 84623, 3134, 5), 'FNear WFest Slide', (3134, 5)))
DEBUG:root:('change_values idxlist:', ((84631, 84631, 3134, 13), '3.0', (3134, 13)))
DEBUG:root:('change_values idxlist:', ((84655, 84655, 3135, 10), 'M32', (3135, 10)))
DEBUG:root:('change_values idxlist:', ((84673, 84673, 3136, 1), '', (3136, 1)))
DEBUG:root:('change_values idxlist:', ((84677, 84677, 3136, 5), '', (3136, 5)))
DEBUG:root:('change_values idxlist:', ((84678, 84678, 3136, 6), '', (3136, 6)))
DEBUG:root:('change_values idxlist:', ((84761, 84761, 3139, 8), 'Apartment', (3139, 8)))
DEBUG:root:('change_values idxlist:', ((84766, 84766, 3139, 13), '1.0', (3139, 13)))
DEBUG:root:('change_values idxlist:', ((84847, 84847, 3142, 13), '4.0', (3142, 13)))
DEBUG:root:('change_values idxlist:', ((84874, 84874, 3143, 13)

True

In [24]:
xx = pd.DataFrame(dcm1.cell_values)
xx[xx[2]==0].merge(xx,left_on=4,right_on=0)

Unnamed: 0,4,0_x,1_x,2_x,3_x,4_x,0_y,1_y,2_y,3_y,4_y
0,66,86427,66,0,"November 28, 2022",66,66,66,-1,2022-11-28,-1
1,89,86428,89,0,Entire home,89,89,89,-1,Entire home/apt,-1
2,167,86429,167,0,Lhke eieu,167,167,167,-1,Lake View,-1
3,170,86430,170,0,Entire home,170,170,170,-1,Entire home/apt,-1
4,224,86431,224,0,Entire home,224,224,224,-1,Entire home/apt,-1
...,...,...,...,...,...,...,...,...,...,...,...
2286,86246,88713,86246,0,Apartment,86246,86246,86246,-1,Entire home/apt,-1
2287,86273,88714,86273,0,Apartment,86273,86273,86273,-1,Entire home/apt,-1
2288,86327,88715,86327,0,Apartment,86327,86327,86327,-1,Entire home/apt,-1
2289,86381,88716,86381,0,Apartment,86381,86381,86381,-1,Entire home/apt,-1


In [25]:
xx[xx[1]==3624]

Unnamed: 0,0,1,2,3,4
3624,3624,3624,-1,41.92748,-1
86509,86509,3624,0,E41.9A2748,3624


In [26]:
dcm1.curr_col = original_pd.columns
dcm1.curr_row = list(original_pd.index) 
dcm1.curr_df = original_pd

In [27]:
#for i,c in enumerate(change_index.columns):
#    dcm1.pd_index.loc[change_index.loc[:,c].values,i]

In [28]:
pd.DataFrame(dcm1.state_detail)

Unnamed: 0,0,1,2,3,4
0,-1,{'op': 'initial'},0,-2,
1,0,perturbed_dataset,0,-1,0.0


In [29]:
def assign_to_original(original_pd,workset_pd):
    original_pd.loc[original_pd.id.isin(workset_pd.id),:] = workset_pd
    return original_pd

In [30]:
airbnb_pd = pd.read_csv("chicago_vert_dataset.csv",dtype=str)
for x in list(filter(lambda x:x.endswith("flag"),airbnb_pd.columns)):
    airbnb_pd.loc[:,x] = airbnb_pd.loc[:,x].astype(int) 

  airbnb_pd.loc[:,x] = airbnb_pd.loc[:,x].astype(int)


In [31]:
airbnb_pd.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license', 'id_flag',
       'host_id_flag', 'neighbourhood_flag', 'latitude_flag', 'longitude_flag',
       'minimum_nights_flag', 'number_of_reviews_flag', 'last_review_flag',
       'room_type_flag'],
      dtype='object')

In [32]:
airbnb_pd.shape,original_pd.shape

((1486, 27), (3201, 27))

In [33]:
# take part of airbnb_pd from original_pd
# using data frame imutability, any changes will be 
# directed toward those subset of original_pd
#airbnb_pd = original_pd[original_pd.id.isin(airbnb_pd.id)]

# cleanup duplicate id
The ID column must contain unique values. If there are any duplicate values in this column, you will need to take action to ensure that each ID is unique. You can do this by either fixing the duplicates (if you want to keep them) or by flagging them for removal (1) using the id_flag column.

In [34]:
def clean_duplicate_id(df):
    #raise Exception("not yet have implementation")
    # do something here
#     df = df[df.duplicated(['id'])]
    df['id_flag'] = df.duplicated(['id'])
    df['id_flag'] = df['id_flag'].map({False: 0, True: 1})
    return df

In [35]:
dup_ids = airbnb_pd[airbnb_pd.id_flag==0]
dup_ids = dup_ids.groupby("id").count()[["name"]].reset_index()
dup_ids = dup_ids[dup_ids.name>1]

problem_n = dup_ids.shape[0]
dq_problem = problem_n/problem_n
problem_n,dq_problem

(64, 1.0)

In [36]:
airbnb_pd = clean_duplicate_id(airbnb_pd)
airbnb_pd

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,781186064034113431,2 Queen Bed Room,431019163,Paul,,LUop,41.88551,-87.63016,Private room,96,...,2506121,0,0,0,0,0,0,0,0,0
1482,782373284583992853,"Logan Square 1br w/ pool, lounge & gym nr L",107434423,Blueground,,Logan Square,41.927101,-87.7043588,Entire home/apt,145,...,,0,0,0,0,0,0,0,0,0
1483,782476073065423478,Affordable apt near Wrigley,252408635,Andie,,Uptown,41.95661,-87.65102,Entire home/apt,58,...,2845517,0,0,0,0,0,0,0,0,0
1484,783249596739848974,"South Loop 1br w/ gym & lounge, nr Grant Park",107434423,Blueground,,Loop,41.8725288,-87.6308608,Entire home/apt,143,...,,0,0,0,0,0,0,0,0,0


# Duplicate IDS checking 
To ensure that all ID values in the dataset are unique, you should check for duplicate IDs. When you run the query to check for duplicates, there should be no rows returned, indicating that there are no duplicate ID values present in the dataset.

In [37]:
dup_ids = airbnb_pd
dup_ids = dup_ids.groupby("id").count()[["name"]].reset_index()
dup_ids = dup_ids[dup_ids.name>1]
removal_flag = (airbnb_pd.id_flag!=0).sum()
corrected = dup_ids.shape[0]


In [38]:
removal_flag

70

In [39]:
completeness = (problem_n-corrected) / problem_n
flagged = removal_flag / problem_n
completeness,flagged


(0.0, 1.09375)

In [40]:
dup_ids = airbnb_pd[airbnb_pd.id_flag==0]
dup_ids = dup_ids.groupby("id").count()[["name"]].reset_index()
dup_ids = dup_ids[dup_ids.name>1]
dup_ids


Unnamed: 0,id,name


In [41]:
dq_post = dup_ids.shape[0]/problem_n
dq_post

0.0

In [42]:
workset_before = original_pd.copy()
workset_after = original_pd.copy()

In [43]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [44]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                           70
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      0
longitude_flag                     0
minimum_nights_flag                0
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [45]:
dcm1.change_df(workset_before,workset_after,"clean_duplicate_id_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(423, 423, 15, 18), (2313, 2313, 85, 18), (3420, 3420, 126, 18), (4122, 4122, 152, 18), (4392, 4392, 162, 18), (4554, 4554, 168, 18), (4743, 4743, 175, 18), (5040, 5040, 186, 18), (5283, 5283, 195, 18), (5580, 5580, 206, 18), (6039, 6039, 223, 18), (6417, 6417, 237, 18), (6849, 6849, 253, 18), (6930, 6930, 256, 18), (7065, 7065, 261, 18), (8847, 8847, 327, 18), (10089, 10089, 373, 18), (10710, 10710, 396, 18), (11061, 11061, 409, 18), (11169, 11169, 413, 18), (11682, 11682, 432, 18), (11763, 11763, 435, 18), (11871, 11871, 439, 18), (11979, 11979, 443, 18), (12276, 12276, 454, 18), (13248, 13248, 490, 18), (13626, 13626, 504, 18), (13977, 13977, 517, 18), (14274, 14274, 528, 18), (14517, 14517, 537, 18), (14652, 14652, 542, 18), (14787, 14787, 547, 18), (15327, 15327, 567, 18), (16137, 16137, 597, 18), (17406, 17406, 644, 18), (18621, 18621, 689, 18), (18729, 18729, 693, 18), (19431, 19431, 719, 18), (19566, 19566, 724, 18), (20943, 20943, 775, 18

True

In [46]:
dcm1.state,dcm1.user,dcm1.state_detail

([(-1, -2), (0, -1), (1, 0)],
 {None: -1, 's1': 0, 'c1': 1},
 [(-1, {'op': 'initial'}, 0, -2, None),
  (0, 'perturbed_dataset', 0, -1, 0),
  (1, 'clean_duplicate_id_c1', 1, 0, 1)])

In [47]:
#dcm1.change_df(workset_before,workset_after,"clean_duplicate_id_c2","c2",1)

In [48]:
workset_before = workset_after.copy()

In [49]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==0]

Unnamed: 0,0,1,2,3,4
86427,86427,66,0,"November 28, 2022",66
86428,86428,89,0,Entire home,89
86429,86429,167,0,Lhke eieu,167
86430,86430,170,0,Entire home,170
86431,86431,224,0,Entire home,224
...,...,...,...,...,...
88713,88713,86246,0,Apartment,86246
88714,88714,86273,0,Apartment,86273
88715,88715,86327,0,Apartment,86327
88716,88716,86381,0,Apartment,86381


# cleanup inconsistent host id
Each host_id value in the dataset should be associated with only one host_name. However, there may be inconsistencies in the dataset where a host_id is associated with different host_name values.

To clean this up, you can either change the host_name value to a consistent value based on information in the dataset, or flag the host_id_flag column to indicate that the row should be removed from downstream tasks.

For example, if you find that a host_id is associated with multiple host_name values, you may want to investigate further to determine which host_name is correct. If one of the host_name values is clearly incorrect (e.g., a misspelling or a name that does not match the owner of the property), you could update the host_name value to the correct value.

Alternatively, if you cannot determine the correct host_name value, or if you want to exclude the row from downstream tasks for other reasons, you can flag the host_id_flag column with a value of 1 to indicate that the row should be removed.

In [50]:
dup_host_id = airbnb_pd[airbnb_pd.host_id_flag==0]
dup_host_id = dup_host_id.groupby(["host_id","host_name"]).count()[["id"]].reset_index()
dup_host_id = dup_host_id.groupby("host_id").count()["id"].reset_index()

problem_n = dup_host_id.shape[0]
dq_problem = problem_n/problem_n
problem_n,dq_problem

(926, 1.0)

In [51]:
def clean_host_id(df):
    #raise Exception("not yet have implementation")
    # do something here
    grouped = df.groupby('host_id')['host_name'].nunique().reset_index(name='count')
    duplicates = grouped[grouped['count'] > 1]['host_id'].tolist()

    for idx, row in df.iterrows():
        if row['host_id'] in duplicates:
            df.loc[idx, 'host_id_flag'] = 1
    return df

In [52]:
airbnb_pd = clean_host_id(airbnb_pd)

# Inconsistent Host ID checking 

This query should return zero rows once you implement the cleaning process

In [53]:
dup_host_id = airbnb_pd
dup_host_id = dup_host_id.groupby(["host_id","host_name"]).count()[["id"]].reset_index()
dup_host_id = dup_host_id.groupby("host_id").count()["id"].reset_index()
dup_host_id[dup_host_id["id"]>1]
removal_flag = (airbnb_pd.host_id_flag!=0).sum()
corrected = dup_host_id.shape[0]


completeness = (problem_n-corrected) / problem_n
flagged = removal_flag / (problem_n/2)
completeness,flagged

(0.0, 0.7645788336933045)

In [54]:
dup_host_id = airbnb_pd[airbnb_pd.host_id_flag==0]
dup_host_id = dup_host_id.groupby(["host_id","host_name"]).count()[["id"]].reset_index()
dup_host_id = dup_host_id.groupby("host_id").count()["id"].reset_index()
dup_host_id[dup_host_id["id"]>1]

Unnamed: 0,host_id,id


In [55]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [56]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group                 0
neighbourhood                       0
latitude                            0
longitude                           0
room_type                           0
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                         0
reviews_per_month                   0
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                             0
id_flag                             0
host_id_flag                      354
neighbourhood_flag                  0
latitude_flag                       0
longitude_flag                      0
minimum_nights_flag                 0
number_of_reviews_flag              0
last_review_flag                    0
room_type_fl

In [57]:
dcm1.change_df(workset_before,workset_after,"clean_inconsistent_host_id_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(721, 721, 26, 19), (748, 748, 27, 19), (1423, 1423, 52, 19), (1450, 1450, 53, 19), (1477, 1477, 54, 19), (1504, 1504, 55, 19), (1531, 1531, 56, 19), (2449, 2449, 90, 19), (2584, 2584, 95, 19), (2611, 2611, 96, 19), (3151, 3151, 116, 19), (3988, 3988, 147, 19), (4150, 4150, 153, 19), (4258, 4258, 157, 19), (4501, 4501, 166, 19), (5149, 5149, 190, 19), (5500, 5500, 203, 19), (5689, 5689, 210, 19), (5743, 5743, 212, 19), (6553, 6553, 242, 19), (7093, 7093, 262, 19), (7120, 7120, 263, 19), (7336, 7336, 271, 19), (7525, 7525, 278, 19), (7876, 7876, 291, 19), (7903, 7903, 292, 19), (8524, 8524, 315, 19), (8551, 8551, 316, 19), (9361, 9361, 346, 19), (9388, 9388, 347, 19), (9415, 9415, 348, 19), (9442, 9442, 349, 19), (9793, 9793, 362, 19), (10171, 10171, 376, 19), (10495, 10495, 388, 19), (10522, 10522, 389, 19), (10846, 10846, 401, 19), (10927, 10927, 404, 19), (11089, 11089, 410, 19), (11278, 11278, 417, 19), (11332, 11332, 419, 19), (11386, 11386, 4

DEBUG:root:('change_values idxlist:', ((721, 721, 26, 19), 1, (26, 19)))
DEBUG:root:('change_values idxlist:', ((748, 748, 27, 19), 1, (27, 19)))
DEBUG:root:('change_values idxlist:', ((1423, 1423, 52, 19), 1, (52, 19)))
DEBUG:root:('change_values idxlist:', ((1450, 1450, 53, 19), 1, (53, 19)))
DEBUG:root:('change_values idxlist:', ((1477, 1477, 54, 19), 1, (54, 19)))
DEBUG:root:('change_values idxlist:', ((1504, 1504, 55, 19), 1, (55, 19)))
DEBUG:root:('change_values idxlist:', ((1531, 1531, 56, 19), 1, (56, 19)))
DEBUG:root:('change_values idxlist:', ((2449, 2449, 90, 19), 1, (90, 19)))
DEBUG:root:('change_values idxlist:', ((2584, 2584, 95, 19), 1, (95, 19)))
DEBUG:root:('change_values idxlist:', ((2611, 2611, 96, 19), 1, (96, 19)))
DEBUG:root:('change_values idxlist:', ((3151, 3151, 116, 19), 1, (116, 19)))
DEBUG:root:('change_values idxlist:', ((3988, 3988, 147, 19), 1, (147, 19)))
DEBUG:root:('change_values idxlist:', ((4150, 4150, 153, 19), 1, (153, 19)))
DEBUG:root:('change_val

DEBUG:root:('change_values idxlist:', ((16597, 16597, 614, 19), 1, (614, 19)))
DEBUG:root:('change_values idxlist:', ((16624, 16624, 615, 19), 1, (615, 19)))
DEBUG:root:('change_values idxlist:', ((16732, 16732, 619, 19), 1, (619, 19)))
DEBUG:root:('change_values idxlist:', ((16948, 16948, 627, 19), 1, (627, 19)))
DEBUG:root:('change_values idxlist:', ((17029, 17029, 630, 19), 1, (630, 19)))
DEBUG:root:('change_values idxlist:', ((17056, 17056, 631, 19), 1, (631, 19)))
DEBUG:root:('change_values idxlist:', ((17083, 17083, 632, 19), 1, (632, 19)))
DEBUG:root:('change_values idxlist:', ((17110, 17110, 633, 19), 1, (633, 19)))
DEBUG:root:('change_values idxlist:', ((17137, 17137, 634, 19), 1, (634, 19)))
DEBUG:root:('change_values idxlist:', ((17218, 17218, 637, 19), 1, (637, 19)))
DEBUG:root:('change_values idxlist:', ((17245, 17245, 638, 19), 1, (638, 19)))
DEBUG:root:('change_values idxlist:', ((17272, 17272, 639, 19), 1, (639, 19)))
DEBUG:root:('change_values idxlist:', ((17299, 17299

DEBUG:root:('change_values idxlist:', ((26641, 26641, 986, 19), 1, (986, 19)))
DEBUG:root:('change_values idxlist:', ((26722, 26722, 989, 19), 1, (989, 19)))
DEBUG:root:('change_values idxlist:', ((26749, 26749, 990, 19), 1, (990, 19)))
DEBUG:root:('change_values idxlist:', ((26830, 26830, 993, 19), 1, (993, 19)))
DEBUG:root:('change_values idxlist:', ((26857, 26857, 994, 19), 1, (994, 19)))
DEBUG:root:('change_values idxlist:', ((26992, 26992, 999, 19), 1, (999, 19)))
DEBUG:root:('change_values idxlist:', ((27019, 27019, 1000, 19), 1, (1000, 19)))
DEBUG:root:('change_values idxlist:', ((27046, 27046, 1001, 19), 1, (1001, 19)))
DEBUG:root:('change_values idxlist:', ((27073, 27073, 1002, 19), 1, (1002, 19)))
DEBUG:root:('change_values idxlist:', ((27154, 27154, 1005, 19), 1, (1005, 19)))
DEBUG:root:('change_values idxlist:', ((27208, 27208, 1007, 19), 1, (1007, 19)))
DEBUG:root:('change_values idxlist:', ((27235, 27235, 1008, 19), 1, (1008, 19)))
DEBUG:root:('change_values idxlist:', ((

DEBUG:root:('change_values idxlist:', ((36442, 36442, 1349, 19), 1, (1349, 19)))
DEBUG:root:('change_values idxlist:', ((36469, 36469, 1350, 19), 1, (1350, 19)))
DEBUG:root:('change_values idxlist:', ((36550, 36550, 1353, 19), 1, (1353, 19)))
DEBUG:root:('change_values idxlist:', ((36766, 36766, 1361, 19), 1, (1361, 19)))
DEBUG:root:('change_values idxlist:', ((37333, 37333, 1382, 19), 1, (1382, 19)))
DEBUG:root:('change_values idxlist:', ((37360, 37360, 1383, 19), 1, (1383, 19)))
DEBUG:root:('change_values idxlist:', ((37495, 37495, 1388, 19), 1, (1388, 19)))
DEBUG:root:('change_values idxlist:', ((37522, 37522, 1389, 19), 1, (1389, 19)))
DEBUG:root:('change_values idxlist:', ((37630, 37630, 1393, 19), 1, (1393, 19)))
DEBUG:root:('change_values idxlist:', ((37711, 37711, 1396, 19), 1, (1396, 19)))
DEBUG:root:('change_values idxlist:', ((37792, 37792, 1399, 19), 1, (1399, 19)))
DEBUG:root:('change_values idxlist:', ((37927, 37927, 1404, 19), 1, (1404, 19)))
DEBUG:root:('change_values i

True

In [58]:
workset_before = workset_after.copy()

In [59]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==1]

Unnamed: 0,0,1,2,3,4
88718,88718,423,1,1,423
88719,88719,2313,1,1,2313
88720,88720,3420,1,1,3420
88721,88721,4122,1,1,4122
88722,88722,4392,1,1,4392
...,...,...,...,...,...
88783,88783,35037,1,1,35037
88784,88784,35280,1,1,35280
88785,88785,36522,1,1,36522
88786,88786,36630,1,1,36630


# cleanup neighbourhood
The neighbourhood column in the dataset should contain values that match the neighbourhoods defined in the official neighbourhood_list. However, there may be some values in the neighbourhood column that are incorrect due to errors or noise in the data.

To clean up the neighbourhood column, you can try to match each value in the column to a valid neighbourhood in the neighbourhood_list using a string distance function such as abydos. If you can successfully match a value in the neighbourhood column to a neighbourhood in the neighbourhood_list, you can replace the value in the dataset with the correct neighbourhood name.

However, if you are unsure about how to clean up a particular value in the neighbourhood column, or if you cannot match the value to a valid neighbourhood in the neighbourhood_list, you can flag the row for deletion by setting the neighbourhood_flag column to a value of 1. If the value in the neighbourhood column is null and you cannot make a determination based on other information in the dataset, you can set the neighbourhood_flag column to a value of 2 to indicate that the row should be included but the neighbourhood value is null.

You can also use the latitude and longitude columns in the dataset to help match values in the neighbourhood column to valid neighbourhoods in the neighbourhood_list. However, you should be aware that the latitude and longitude values may also contain errors or noise, so you should exercise caution when using these columns to clean up the neighbourhood column.

In [60]:
neighbourhood_list = [ 'Hyde Park', 'West Town', 'Lincoln Park', 'Near West Side', 'Lake View',    'Dunning', 'Rogers Park', 'Logan Square', 'Uptown', 'Edgewater',    'North Center', 'Albany Park', 'West Ridge', 'Pullman', 'Irving Park',    'Beverly', 'Lower West Side', 'Near South Side', 'Near North Side',    'Grand Boulevard', 'Bridgeport', 'Humboldt Park', 'Chatham', 'Kenwood',    'Loop', 'West Lawn', 'Lincoln Square', 'Woodlawn', 'Avondale',    'Forest Glen', 'Portage Park', 'East Garfield Park', 'Washington Park',    'North Lawndale', 'Armour Square', 'South Lawndale', 'South Shore',    'Morgan Park', 'South Deering', 'West Garfield Park', 'Hermosa',    'Mckinley Park', 'Douglas', 'Hegewisch', 'West Elsdon', 'Norwood Park',    'Garfield Ridge', 'Austin', 'Belmont Cragin', 'Jefferson Park', 'Ashburn',    'Greater Grand Crossing', 'North Park', 'Oakland', 'Archer Heights',    'Edison Park', 'Englewood', 'Ohare', 'Brighton Park', 'Chicago Lawn',    'New City', 'South Chicago', 'Mount Greenwood', 'Montclare', 'Roseland',    'West Englewood', 'Calumet Heights', 'Auburn Gresham', 'Fuller Park',    'Avalon Park', 'Burnside', 'Clearing', 'Gage Park', 'West Pullman',    'Washington Heights', 'East Side']
print(neighbourhood_list)

['Hyde Park', 'West Town', 'Lincoln Park', 'Near West Side', 'Lake View', 'Dunning', 'Rogers Park', 'Logan Square', 'Uptown', 'Edgewater', 'North Center', 'Albany Park', 'West Ridge', 'Pullman', 'Irving Park', 'Beverly', 'Lower West Side', 'Near South Side', 'Near North Side', 'Grand Boulevard', 'Bridgeport', 'Humboldt Park', 'Chatham', 'Kenwood', 'Loop', 'West Lawn', 'Lincoln Square', 'Woodlawn', 'Avondale', 'Forest Glen', 'Portage Park', 'East Garfield Park', 'Washington Park', 'North Lawndale', 'Armour Square', 'South Lawndale', 'South Shore', 'Morgan Park', 'South Deering', 'West Garfield Park', 'Hermosa', 'Mckinley Park', 'Douglas', 'Hegewisch', 'West Elsdon', 'Norwood Park', 'Garfield Ridge', 'Austin', 'Belmont Cragin', 'Jefferson Park', 'Ashburn', 'Greater Grand Crossing', 'North Park', 'Oakland', 'Archer Heights', 'Edison Park', 'Englewood', 'Ohare', 'Brighton Park', 'Chicago Lawn', 'New City', 'South Chicago', 'Mount Greenwood', 'Montclare', 'Roseland', 'West Englewood', 'Calu

In [61]:
from Levenshtein import distance

In [62]:
airbnb_pd

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,781186064034113431,2 Queen Bed Room,431019163,Paul,,LUop,41.88551,-87.63016,Private room,96,...,2506121,0,1,0,0,0,0,0,0,0
1482,782373284583992853,"Logan Square 1br w/ pool, lounge & gym nr L",107434423,Blueground,,Logan Square,41.927101,-87.7043588,Entire home/apt,145,...,,0,1,0,0,0,0,0,0,0
1483,782476073065423478,Affordable apt near Wrigley,252408635,Andie,,Uptown,41.95661,-87.65102,Entire home/apt,58,...,2845517,0,0,0,0,0,0,0,0,0
1484,783249596739848974,"South Loop 1br w/ gym & lounge, nr Grant Park",107434423,Blueground,,Loop,41.8725288,-87.6308608,Entire home/apt,143,...,,0,1,0,0,0,0,0,0,0


In [63]:
def find_closest_match(string):
    closest_match = ""
    min_distance = float('inf')
    if pd.isna(string):
        return string
    else:
        for term in neighbourhood_list:
            d = distance(string, term)
            if d < min_distance:
                closest_match = term
                min_distance = d
        return closest_match

def clean_neighbourhood(df):
    #raise Exception("not yet have implementation")
    df['neighbourhood'] = df['neighbourhood'].apply(find_closest_match)
    df.loc[~df['neighbourhood'].isin(neighbourhood_list), 'neighbourhood_flag'] = 1
    df.loc[df['neighbourhood'].isna(), 'neighbourhood_flag'] = 2
    return df

In [64]:
neighbourhood_check = airbnb_pd[airbnb_pd.neighbourhood_flag==0]
neighbourhood_check = neighbourhood_check[neighbourhood_check.neighbourhood.apply(lambda x:x not in neighbourhood_list)]
neighbourhood_check[["id","neighbourhood"]]

problem_n = neighbourhood_check.shape[0]
dq_problem = problem_n/problem_n
problem_n,dq_problem

(322, 1.0)

In [65]:
airbnb_pd = clean_neighbourhood(airbnb_pd)

In [66]:
airbnb_pd[airbnb_pd['neighbourhood_flag']==2]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
162,16972979,- lower level in house,86183324,Cristina,,,41.79252,-87.78619,Private room,,...,R18000027972,1,0,2,0,0,0,0,0,0
237,21700360,Large Guest Space with Private Entry in Rodger...,157222289,Elena,,,O41.99X881,-F87.v6O7154,,141.0,...,R17000022274,1,0,2,0,0,0,0,0,0
412,33438071,Urban Coach Casa * parking * 10 min to downtown,127847265,Martin,,,41.8u5C497,,Entire home,55.0,...,City registration pending,0,0,2,0,0,0,0,0,0
596,42757880,"Loop 1BR w/ Gym, Pool, nr Chicago Riverwalk",107434423,Blueground,,,41.8860812,,,151.0,...,32+ Days Listing,0,1,2,0,0,0,0,0,0
643,44566798,"Loop 2BR w/ Gym, Roof, nr Riverwalk & the L",107434423,Blueground,,,41.88558099999999,-87.6257005,,117.0,...,32+days Listing,0,1,2,0,0,0,0,0,0
958,54000131,Idyllic Lakefront Sanctuary w/ Unforgettable V...,331726274,Sally,,,42.02112,-87.6645,,95.0,...,R22000080607,1,0,2,0,0,0,0,0,0


In [67]:
airbnb_pd[airbnb_pd['neighbourhood']==2]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag


# Neighbourhood checking

This query should return zero rows once you implement the cleaning process

In [68]:
neighbourhood_check = airbnb_pd
neighbourhood_check = neighbourhood_check[neighbourhood_check.neighbourhood.apply(lambda x:x not in neighbourhood_list)]
neighbourhood_check[["id","neighbourhood"]]

removal_flag = (airbnb_pd.neighbourhood_flag!=0).sum()
corrected = neighbourhood_check.shape[0]

completeness = (problem_n-corrected) / problem_n
flagged = removal_flag / (problem_n/2)
completeness,flagged

(0.9813664596273292, 0.037267080745341616)

In [69]:
neighbourhood_check = airbnb_pd[airbnb_pd.neighbourhood_flag==0]
neighbourhood_check = neighbourhood_check[neighbourhood_check.neighbourhood.apply(lambda x:x not in neighbourhood_list)]
neighbourhood_check[["id","neighbourhood"]]

Unnamed: 0,id,neighbourhood


In [70]:
workset_before = workset_after.copy()

In [71]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [72]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group                 0
neighbourhood                     316
latitude                            0
longitude                           0
room_type                           0
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                         0
reviews_per_month                   0
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                             0
id_flag                             0
host_id_flag                        0
neighbourhood_flag                  6
latitude_flag                       0
longitude_flag                      0
minimum_nights_flag                 0
number_of_reviews_flag              0
last_review_flag                    0
room_type_fl

In [73]:
dcm1.change_df(workset_before,workset_after,"clean_neighbourhood_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(167, 86429, 6, 5), (248, 86433, 9, 5), (329, 86435, 12, 5), (356, 86436, 13, 5), (383, 86438, 14, 5), (410, 86439, 15, 5), (572, 86444, 21, 5), (842, 86449, 31, 5), (977, 86452, 36, 5), (1085, 86457, 40, 5), (1220, 86458, 45, 5), (1355, 86459, 50, 5), (1382, 86460, 51, 5), (1409, 86463, 52, 5), (1544, 86467, 57, 5), (1733, 86472, 64, 5), (1760, 86473, 65, 5), (1949, 86476, 72, 5), (2111, 86478, 78, 5), (2273, 86480, 84, 5), (2300, 86481, 85, 5), (2408, 86483, 89, 5), (2435, 86484, 90, 5), (2462, 86487, 91, 5), (2651, 86490, 98, 5), (2948, 86493, 109, 5), (3002, 86494, 111, 5), (3029, 86495, 112, 5), (3164, 86496, 117, 5), (3191, 86498, 118, 5), (3380, 86502, 125, 5), (3407, 86503, 126, 5), (3623, 86508, 134, 5), (3731, 86510, 138, 5), (3758, 86511, 139, 5), (3812, 86512, 141, 5), (3947, 86515, 146, 5), (4001, 86518, 148, 5), (4394, 4394, 162, 20), (4514, 86531, 167, 5), (4541, 86532, 168, 5), (4649, 86533, 172, 5), (4676, 86534, 173, 5), (4838, 8

DEBUG:root:('change_values idxlist:', ((167, 86429, 6, 5), 'Lake View', (6, 5)))
DEBUG:root:('change_values idxlist:', ((248, 86433, 9, 5), 'Lincoln Park', (9, 5)))
DEBUG:root:('change_values idxlist:', ((329, 86435, 12, 5), 'Lincoln Park', (12, 5)))
DEBUG:root:('change_values idxlist:', ((356, 86436, 13, 5), 'Lincoln Park', (13, 5)))
DEBUG:root:('change_values idxlist:', ((383, 86438, 14, 5), 'Logan Square', (14, 5)))
DEBUG:root:('change_values idxlist:', ((410, 86439, 15, 5), 'Logan Square', (15, 5)))
DEBUG:root:('change_values idxlist:', ((572, 86444, 21, 5), 'Lake View', (21, 5)))
DEBUG:root:('change_values idxlist:', ((842, 86449, 31, 5), 'Logan Square', (31, 5)))
DEBUG:root:('change_values idxlist:', ((977, 86452, 36, 5), 'Uptown', (36, 5)))
DEBUG:root:('change_values idxlist:', ((1085, 86457, 40, 5), 'Lake View', (40, 5)))
DEBUG:root:('change_values idxlist:', ((1220, 86458, 45, 5), 'Grand Boulevard', (45, 5)))
DEBUG:root:('change_values idxlist:', ((1355, 86459, 50, 5), 'Hyde P

DEBUG:root:('change_values idxlist:', ((11480, 86720, 425, 5), 'Grand Boulevard', (425, 5)))
DEBUG:root:('change_values idxlist:', ((11939, 86736, 442, 5), 'Ohare', (442, 5)))
DEBUG:root:('change_values idxlist:', ((11966, 86737, 443, 5), 'Ohare', (443, 5)))
DEBUG:root:('change_values idxlist:', ((12047, 86739, 446, 5), 'Uptown', (446, 5)))
DEBUG:root:('change_values idxlist:', ((12155, 86744, 450, 5), 'Logan Square', (450, 5)))
DEBUG:root:('change_values idxlist:', ((12236, 86749, 453, 5), 'Garfield Ridge', (453, 5)))
DEBUG:root:('change_values idxlist:', ((12263, 86750, 454, 5), 'Garfield Ridge', (454, 5)))
DEBUG:root:('change_values idxlist:', ((12371, 86757, 458, 5), 'Loop', (458, 5)))
DEBUG:root:('change_values idxlist:', ((12533, 86765, 464, 5), 'Douglas', (464, 5)))
DEBUG:root:('change_values idxlist:', ((12560, 86767, 465, 5), 'Hyde Park', (465, 5)))
DEBUG:root:('change_values idxlist:', ((12587, 86768, 466, 5), 'Grand Boulevard', (466, 5)))
DEBUG:root:('change_values idxlist:'

DEBUG:root:('change_values idxlist:', ((23198, 87016, 859, 5), 'Lower West Side', (859, 5)))
DEBUG:root:('change_values idxlist:', ((23225, 87017, 860, 5), 'Lower West Side', (860, 5)))
DEBUG:root:('change_values idxlist:', ((23387, 87023, 866, 5), 'Woodlawn', (866, 5)))
DEBUG:root:('change_values idxlist:', ((23441, 87025, 868, 5), 'South Shore', (868, 5)))
DEBUG:root:('change_values idxlist:', ((23603, 87027, 874, 5), 'Brighton Park', (874, 5)))
DEBUG:root:('change_values idxlist:', ((23792, 87034, 881, 5), 'Grand Boulevard', (881, 5)))
DEBUG:root:('change_values idxlist:', ((24008, 87039, 889, 5), 'Lincoln Square', (889, 5)))
DEBUG:root:('change_values idxlist:', ((24062, 87041, 891, 5), 'Logan Square', (891, 5)))
DEBUG:root:('change_values idxlist:', ((24116, 87043, 893, 5), 'Grand Boulevard', (893, 5)))
DEBUG:root:('change_values idxlist:', ((24143, 87044, 894, 5), 'Lincoln Park', (894, 5)))
DEBUG:root:('change_values idxlist:', ((24197, 87046, 896, 5), 'Lake View', (896, 5)))
DEB

DEBUG:root:('change_values idxlist:', ((33944, 87329, 1257, 5), 'Ohare', (1257, 5)))
DEBUG:root:('change_values idxlist:', ((33971, 87330, 1258, 5), 'Woodlawn', (1258, 5)))
DEBUG:root:('change_values idxlist:', ((34511, 87343, 1278, 5), 'Rogers Park', (1278, 5)))
DEBUG:root:('change_values idxlist:', ((34646, 87347, 1283, 5), 'Lincoln Park', (1283, 5)))
DEBUG:root:('change_values idxlist:', ((34700, 87350, 1285, 5), 'Lake View', (1285, 5)))
DEBUG:root:('change_values idxlist:', ((34727, 87351, 1286, 5), 'Uptown', (1286, 5)))
DEBUG:root:('change_values idxlist:', ((34781, 87352, 1288, 5), 'Loop', (1288, 5)))
DEBUG:root:('change_values idxlist:', ((35051, 87357, 1298, 5), 'Hyde Park', (1298, 5)))
DEBUG:root:('change_values idxlist:', ((35375, 87363, 1310, 5), 'Loop', (1310, 5)))
DEBUG:root:('change_values idxlist:', ((35456, 87364, 1313, 5), 'Loop', (1313, 5)))
DEBUG:root:('change_values idxlist:', ((35483, 87366, 1314, 5), 'Lincoln Park', (1314, 5)))
DEBUG:root:('change_values idxlist:'

True

In [74]:
workset_before = workset_after.copy()

In [75]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==2]

Unnamed: 0,0,1,2,3,4
88788,88788,721,2,1,721
88789,88789,748,2,1,748
88790,88790,1423,2,1,1423
88791,88791,1450,2,1,1450
88792,88792,1477,2,1,1477
...,...,...,...,...,...
89137,89137,39871,2,1,39871
89138,89138,39979,2,1,39979
89139,89139,40006,2,1,40006
89140,89140,40033,2,1,40033


# cleanup latitude and longitude
The latitude and longitude values in the dataset must fall within the range of -90 to +90 for latitude and -180 to +180 for longitude to ensure that they meet the criteria for analysis. We have provided a check number function to validate the latitude and longitude columns. Any values outside of these ranges should be cleaned to meet the criteria.

If you are unsure what to do with a value or if it is a null value, you can flag the row for deletion by setting latitude_flag or longitude_flag to 1 or 2, respectively.

In [76]:
def check_number(x,start=-90,end=90):
    try:
        temp_x = float(x)
        if start <= temp_x <= end:
            return True
    except:
        return False

In [77]:
import re
def clean_latitude(df):
    #raise Exception("not yet have implementation")
    # do something here
    lat_mask = df['latitude'].apply(lambda x: not check_number(x, -90, 90))
    df.loc[lat_mask, 'latitude_flag'] = 1
#     df.loc[lat_mask, 'delete_flag'] = 1
    df.loc[df['latitude'].isna(), 'latitude_flag'] = 2
    # nikolaus changed clean_latitude implementation to be
    df.loc[df['latitude_flag']==1, 'latitude'] = df.loc[df['latitude_flag']==1, 'latitude'].apply(lambda x: clean_latitude_pls(x))
    # convert the latitude_flag back
    df.loc[df['latitude_flag']==1, 'latitude_flag'] = 0
    return df

In [78]:
def clean_latitude_pls(latitude_string):
    # nikolaus: add minus sign (since range can be - to +)
    latitude_string = re.sub("[^0-9.\-]", "", latitude_string)
    # convert the cleaned latitude string to a float data type
    latitude = float(latitude_string)
    # if the latitude is greater than 90 (the maximum latitude value), set it to NaN
    if latitude > 90:
        latitude = float('nan')
    return latitude

In [79]:
# apply the clean_latitude function to the 'latitude' column
airbnb_pd = clean_latitude(airbnb_pd)
# airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude']
#airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude'] = airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude'].apply(lambda x: clean_latitude_pls(x))

In [80]:
airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude']

Series([], Name: latitude, dtype: object)

# Latitude checking

This query should return zero rows once you implement the cleaning process

In [81]:
lat_check_pd = airbnb_pd[airbnb_pd.latitude_flag==0]
lat_check_pd = lat_check_pd[lat_check_pd.latitude.apply(lambda x:check_number(x,-90,90))==False]
lat_check_pd[["id","latitude"]]

Unnamed: 0,id,latitude


In [82]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [83]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                          83
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      1
longitude_flag                     0
minimum_nights_flag                0
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [84]:
dcm1.change_df(workset_before,workset_after,"clean_latitude_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(3624, 86509, 134, 6), (5082, 86548, 188, 6), (5163, 86551, 191, 6), (6216, 86580, 230, 6), (6243, 86582, 231, 6), (6378, 86586, 236, 6), (6405, 86589, 237, 6), (6459, 86593, 239, 6), (6567, 86597, 243, 6), (6702, 86602, 248, 6), (7863, 86631, 291, 6), (7890, 86632, 292, 6), (8619, 86654, 319, 6), (8646, 86655, 320, 6), (10023, 86685, 371, 6), (10347, 86690, 383, 6), (10860, 86697, 402, 6), (11130, 86704, 412, 6), (11157, 86707, 413, 6), (11562, 86721, 428, 6), (11616, 86723, 430, 6), (11697, 86729, 433, 6), (11724, 86730, 434, 6), (11751, 86731, 435, 6), (12102, 86743, 448, 6), (12291, 86752, 455, 6), (12372, 86758, 458, 6), (12453, 86762, 461, 6), (13182, 86776, 488, 6), (13587, 86783, 503, 6), (13614, 86786, 504, 6), (13830, 86792, 512, 6), (14493, 14493, 536, 21), (14613, 86807, 541, 6), (14640, 86810, 542, 6), (14829, 86818, 549, 6), (15369, 86827, 569, 6), (15882, 86835, 588, 6), (16989, 86862, 629, 6), (18231, 86900, 675, 6), (19689, 86927,

DEBUG:root:('change_values idxlist:', ((31002, 87228, 1148, 6), 41.91991, (1148, 6)))
DEBUG:root:('change_values idxlist:', ((31191, 87235, 1155, 6), 41.97039, (1155, 6)))
DEBUG:root:('change_values idxlist:', ((31434, 87242, 1164, 6), 41.91646, (1164, 6)))
DEBUG:root:('change_values idxlist:', ((32460, 87269, 1202, 6), 41.95124, (1202, 6)))
DEBUG:root:('change_values idxlist:', ((32487, 87272, 1203, 6), 41.95124, (1203, 6)))
DEBUG:root:('change_values idxlist:', ((32568, 87278, 1206, 6), 41.85792, (1206, 6)))
DEBUG:root:('change_values idxlist:', ((32595, 87280, 1207, 6), 41.85792, (1207, 6)))
DEBUG:root:('change_values idxlist:', ((33432, 87308, 1238, 6), 41.9383437, (1238, 6)))
DEBUG:root:('change_values idxlist:', ((33729, 87323, 1249, 6), 41.96602, (1249, 6)))
DEBUG:root:('change_values idxlist:', ((34458, 87342, 1276, 6), 41.8474879, (1276, 6)))
DEBUG:root:('change_values idxlist:', ((34674, 87349, 1284, 6), 41.87118, (1284, 6)))
DEBUG:root:('change_values idxlist:', ((35619, 873

True

In [85]:
dcm1

<__main__.TransformDCM at 0x7f1212869af0>

In [86]:
workset_before = workset_after.copy()

In [87]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==3]

Unnamed: 0,0,1,2,3,4
89142,89142,167,3,Lake View,86429
89143,89143,248,3,Lincoln Park,86433
89144,89144,329,3,Lincoln Park,86435
89145,89145,356,3,Lincoln Park,86436
89146,89146,383,3,Logan Square,86438
...,...,...,...,...,...
89459,89459,39614,3,Loop,87472
89460,89460,39749,3,Lake View,87476
89461,89461,39884,3,South Shore,87479
89462,89462,39992,3,Loop,87481


In [88]:
def clean_longitude(df):
    #raise Exception("not yet have implementation")
    # do something here
    # validate and clean the longitude column
    lon_mask = df['longitude'].apply(lambda x: not check_number(x, -180, 180))
    df.loc[lon_mask, 'longitude_flag'] = 1
    df.loc[df['longitude'].isna(), 'longitude_flag'] = 2
        
    return df

In [89]:
def clean_longi_pls(longi_string):
    # nikolaus: add - on the regex since it can be positive or negative
    longi_string = re.sub("[^0-9.\-]", "", longi_string)
    longi = float(longi_string)
    return longi

In [90]:
airbnb_pd = clean_longitude(airbnb_pd)
airbnb_pd[airbnb_pd['longitude_flag']==1]['longitude']

61                   -8B7.7T005
77                 -87S.68A87p8
127                n-n8T7.74887
154                p-87.6LC9944
156                -87kH.61d926
                 ...           
1394               -87be.r67212
1422               kH-8f7.71057
1442             -87dn.6707C466
1466               h-87S.71n622
1485    -87.6399U92ak733t8a0121
Name: longitude, Length: 80, dtype: object

In [91]:
airbnb_pd.loc[airbnb_pd['longitude_flag']==1, 'longitude'] = airbnb_pd.loc[airbnb_pd['longitude_flag']==1, 'longitude'].apply(lambda x: clean_longi_pls(x))
airbnb_pd.loc[airbnb_pd['longitude_flag']==1, 'longitude']
# nikolaus change longitude_flag back to 0
airbnb_pd.loc[airbnb_pd['longitude_flag']==1, 'longitude_flag'] = 0

# Longitude checking

This query should return zero rows once you implement the cleaning process

In [92]:
lon_check_pd = airbnb_pd[airbnb_pd.longitude_flag==0]
lon_check_pd = lon_check_pd[lon_check_pd.longitude.apply(lambda x:check_number(x,-180,180))==False]
lon_check_pd[["id","longitude"]]

Unnamed: 0,id,longitude


In [93]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [94]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                         80
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      0
longitude_flag                    10
minimum_nights_flag                0
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [95]:
dcm1.change_df(workset_before,workset_after,"clean_longitude_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(427, 427, 15, 22), (1654, 86470, 61, 7), (2086, 86477, 77, 7), (3436, 86504, 127, 7), (4099, 4099, 151, 22), (4165, 86522, 154, 7), (4219, 86523, 156, 7), (5584, 5584, 206, 22), (6055, 86575, 224, 7), (6379, 86587, 236, 7), (6406, 86590, 237, 7), (6811, 86605, 252, 7), (6838, 86606, 253, 7), (8485, 86646, 314, 7), (8647, 86656, 320, 7), (9133, 86664, 338, 7), (9511, 86671, 352, 7), (9646, 86675, 357, 7), (10024, 86686, 371, 7), (11146, 11146, 412, 22), (11455, 86718, 424, 7), (11875, 11875, 439, 22), (12535, 86766, 464, 7), (12940, 86772, 479, 7), (13345, 86777, 494, 7), (13804, 86790, 511, 7), (13912, 86794, 515, 7), (15019, 86820, 556, 7), (15046, 86822, 557, 7), (15289, 86823, 566, 7), (15316, 86824, 567, 7), (15883, 86836, 588, 7), (16114, 16114, 596, 22), (16423, 86849, 608, 7), (17989, 86893, 666, 7), (18178, 86898, 673, 7), (19204, 86915, 711, 7), (19528, 86921, 723, 7), (19555, 86923, 724, 7), (19717, 86929, 730, 7), (19771, 86931, 732, 7

DEBUG:root:('change_values idxlist:', ((27601, 87138, 1022, 7), -87.58559, (1022, 7)))
DEBUG:root:('change_values idxlist:', ((28546, 87155, 1057, 7), -87.6227, (1057, 7)))
DEBUG:root:('change_values idxlist:', ((28924, 87165, 1071, 7), -87.66357, (1071, 7)))
DEBUG:root:('change_values idxlist:', ((29707, 87185, 1100, 7), -87.63386, (1100, 7)))
DEBUG:root:('change_values idxlist:', ((30991, 30991, 1147, 22), 2, (1147, 22)))
DEBUG:root:('change_values idxlist:', ((31138, 87231, 1153, 7), -87.6584419, (1153, 7)))
DEBUG:root:('change_values idxlist:', ((31165, 87233, 1154, 7), -87.6584419, (1154, 7)))
DEBUG:root:('change_values idxlist:', ((31651, 87246, 1172, 7), -87.68309, (1172, 7)))
DEBUG:root:('change_values idxlist:', ((31759, 87247, 1176, 7), -87.630999, (1176, 7)))
DEBUG:root:('change_values idxlist:', ((31921, 87251, 1182, 7), -87.63481, (1182, 7)))
DEBUG:root:('change_values idxlist:', ((32272, 87262, 1195, 7), -87.65993, (1195, 7)))
DEBUG:root:('change_values idxlist:', ((32542

True

In [96]:
workset_before = workset_after.copy()

In [97]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==4]

Unnamed: 0,0,1,2,3,4
89464,89464,3624,4,41.92748,86509
89465,89465,5082,4,41.93203,86548
89466,89466,5163,4,41.93123,86551
89467,89467,6216,4,41.91334,86580
89468,89468,6243,4,41.69097,86582
...,...,...,...,...,...
89543,89543,37428,4,41.955908,87410
89544,89544,37455,4,42.00246,87411
89545,89545,37779,4,41.881121,87418
89546,89546,38049,4,41.84067,87430


# cleanup room type
The "room_type" column in the dataset should contain one of the values defined in the list of allowed_room_type provided by the authority: ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room']. Any value outside of this list needs to be adjusted to one of the allowed values.

If you are unsure about how to adjust the value or cannot find a suitable value, you can flag the row for deletion by setting the value of room_type_flag to 1. If the "room_type" column has a null value and you cannot decide on an appropriate value, you can set the value of room_type_flag to 2.

In [98]:
allowed_room_type = ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room']

In [99]:
def clean_room_type(df):
    #raise Exception("not yet have implementation")
    # do something here
    df.loc[~df['room_type'].isin(allowed_room_type), 'room_type_flag'] = 1
    df.loc[df['room_type'].isna(), 'room_type_flag'] = 2 
    return df

In [100]:
airbnb_pd = clean_room_type(airbnb_pd)
res = airbnb_pd[airbnb_pd['room_type_flag']==1].groupby('room_type').count()
airbnb_pd['room_type'] = airbnb_pd['room_type'].replace({'Entire home': 'Entire home/apt'})
airbnb_pd[airbnb_pd['room_type_flag']==0]['room_type']
# airbnb_pd[airbnb_pd['room_type_flag']==1]['room_type'] = 'Entire home/apt'
# nikolaus change room_type_flag to 0
airbnb_pd.loc[airbnb_pd['room_type_flag']==1, 'room_type_flag'] = 0

# room_type checking

This query should return zero rows once you implement the cleaning process

In [101]:
room_type_pd = airbnb_pd[airbnb_pd.room_type_flag==0]
room_type_pd = room_type_pd[room_type_pd.room_type.apply(lambda x: x not in allowed_room_type)]
room_type_pd[["id","room_type"]]

Unnamed: 0,id,room_type


In [102]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [103]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group                 0
neighbourhood                       0
latitude                            0
longitude                           0
room_type                         189
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                         0
reviews_per_month                   0
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                             0
id_flag                             0
host_id_flag                        0
neighbourhood_flag                  0
latitude_flag                       0
longitude_flag                      0
minimum_nights_flag                 0
number_of_reviews_flag              0
last_review_flag                    0
room_type_fl

In [104]:
dcm1.change_df(workset_before,workset_after,"clean_room_type_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(89, 86428, 3, 8), (170, 86430, 6, 8), (224, 86431, 8, 8), (359, 86437, 13, 8), (431, 431, 15, 26), (1061, 86455, 39, 8), (1601, 86469, 59, 8), (1871, 86475, 69, 8), (2357, 86482, 87, 8), (2438, 86485, 90, 8), (2492, 86488, 92, 8), (2870, 86492, 106, 8), (3167, 86497, 117, 8), (3221, 86499, 119, 8), (3545, 86505, 131, 8), (3572, 86506, 132, 8), (3815, 86513, 141, 8), (3977, 86516, 147, 8), (4409, 86528, 163, 8), (4841, 86545, 179, 8), (5786, 86570, 214, 8), (5975, 86574, 221, 8), (6191, 86579, 229, 8), (6425, 6425, 237, 26), (6623, 86599, 245, 8), (6650, 86600, 246, 8), (6938, 6938, 256, 26), (7028, 86612, 260, 8), (7055, 86614, 261, 8), (7433, 86623, 275, 8), (7460, 86624, 276, 8), (8540, 86649, 316, 8), (8594, 86653, 318, 8), (8891, 86659, 329, 8), (8972, 86660, 332, 8), (9026, 86662, 334, 8), (9215, 86666, 341, 8), (9566, 86673, 354, 8), (9647, 86676, 357, 8), (9674, 86677, 358, 8), (9701, 86679, 359, 8), (9890, 86683, 366, 8), (10106, 86687, 3

DEBUG:root:('change_values idxlist:', ((89, 86428, 3, 8), 'Entire home/apt', (3, 8)))
DEBUG:root:('change_values idxlist:', ((170, 86430, 6, 8), 'Entire home/apt', (6, 8)))
DEBUG:root:('change_values idxlist:', ((224, 86431, 8, 8), 'Entire home/apt', (8, 8)))
DEBUG:root:('change_values idxlist:', ((359, 86437, 13, 8), 'Entire home/apt', (13, 8)))
DEBUG:root:('change_values idxlist:', ((431, 431, 15, 26), 2, (15, 26)))
DEBUG:root:('change_values idxlist:', ((1061, 86455, 39, 8), 'Entire home/apt', (39, 8)))
DEBUG:root:('change_values idxlist:', ((1601, 86469, 59, 8), 'Entire home/apt', (59, 8)))
DEBUG:root:('change_values idxlist:', ((1871, 86475, 69, 8), 'Entire home/apt', (69, 8)))
DEBUG:root:('change_values idxlist:', ((2357, 86482, 87, 8), 'Entire home/apt', (87, 8)))
DEBUG:root:('change_values idxlist:', ((2438, 86485, 90, 8), 'Entire home/apt', (90, 8)))
DEBUG:root:('change_values idxlist:', ((2492, 86488, 92, 8), 'Entire home/apt', (92, 8)))
DEBUG:root:('change_values idxlist:', 

DEBUG:root:('change_values idxlist:', ((20042, 86944, 742, 8), 'Entire home/apt', (742, 8)))
DEBUG:root:('change_values idxlist:', ((20717, 86953, 767, 8), 'Entire home/apt', (767, 8)))
DEBUG:root:('change_values idxlist:', ((20987, 86960, 777, 8), 'Entire home/apt', (777, 8)))
DEBUG:root:('change_values idxlist:', ((21014, 86962, 778, 8), 'Entire home/apt', (778, 8)))
DEBUG:root:('change_values idxlist:', ((21662, 86974, 802, 8), 'Entire home/apt', (802, 8)))
DEBUG:root:('change_values idxlist:', ((22175, 86987, 821, 8), 'Entire home/apt', (821, 8)))
DEBUG:root:('change_values idxlist:', ((22418, 86992, 830, 8), 'Entire home/apt', (830, 8)))
DEBUG:root:('change_values idxlist:', ((22661, 86999, 839, 8), 'Entire home/apt', (839, 8)))
DEBUG:root:('change_values idxlist:', ((22688, 87000, 840, 8), 'Entire home/apt', (840, 8)))
DEBUG:root:('change_values idxlist:', ((22742, 87003, 842, 8), 'Entire home/apt', (842, 8)))
DEBUG:root:('change_values idxlist:', ((23309, 87020, 863, 8), 'Entire

DEBUG:root:('change_values idxlist:', ((36215, 87383, 1341, 8), 'Entire home/apt', (1341, 8)))
DEBUG:root:('change_values idxlist:', ((36323, 87385, 1345, 8), 'Entire home/apt', (1345, 8)))
DEBUG:root:('change_values idxlist:', ((36809, 87394, 1363, 8), 'Entire home/apt', (1363, 8)))
DEBUG:root:('change_values idxlist:', ((36971, 87396, 1369, 8), 'Entire home/apt', (1369, 8)))
DEBUG:root:('change_values idxlist:', ((37268, 87403, 1380, 8), 'Entire home/apt', (1380, 8)))
DEBUG:root:('change_values idxlist:', ((37700, 87417, 1396, 8), 'Entire home/apt', (1396, 8)))
DEBUG:root:('change_values idxlist:', ((37781, 87419, 1399, 8), 'Entire home/apt', (1399, 8)))
DEBUG:root:('change_values idxlist:', ((37808, 87420, 1400, 8), 'Entire home/apt', (1400, 8)))
DEBUG:root:('change_values idxlist:', ((37835, 87422, 1401, 8), 'Entire home/apt', (1401, 8)))
DEBUG:root:('change_values idxlist:', ((37889, 87424, 1403, 8), 'Entire home/apt', (1403, 8)))
DEBUG:root:('change_values idxlist:', ((37970, 874

True

In [105]:
workset_before = workset_after.copy()

In [106]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==5]

Unnamed: 0,0,1,2,3,4
89548,89548,427,5,2,427
89549,89549,1654,5,-87.7005,86470
89550,89550,2086,5,-87.68878,86477
89551,89551,3436,5,-87.74887,86504
89552,89552,4099,5,2,4099
...,...,...,...,...,...
89633,89633,37645,5,-87.67212,87416
89634,89634,38401,5,-87.71057,87440
89635,89635,38941,5,-87.670747,87455
89636,89636,39589,5,-87.71622,87471


# cleanup minimum_nights and number_of_reviews

The columns "minimum_nights" and "number_of_reviews" should both be integer values. "minimum_nights" should be a value between 1 and the number of days in a year (365), while "number_of_reviews" should be a value between 0 and 999999.

To check if these columns meet the criteria, we have provided a "check_integer" function. Any values that do not meet the criteria should be cleaned to meet the criteria for analysis.

If you are unsure what to do with a value or if it is a null value, you can flag the row for deletion by setting "minimum_nights_flag" or "number_of_reviews_flag" to 1 or 2, respectively.

In [107]:
def check_integer(x,start=0,end=365):
    try:
        temp_x = int(x)
        if start <= temp_x <= end:
            return True
    except:
        return False

In [108]:
def clean_minimum_nights(df):
    #raise Exception("not yet have implementation")
    # do something here
    nights_mask = df['minimum_nights'].apply(lambda x: not check_integer(x))
    df.loc[nights_mask, 'minimum_nights_flag'] = 1
    df.loc[df['minimum_nights'].isna(), 'minimum_nights_flag'] = 2
    return df

In [109]:
airbnb_pd = clean_minimum_nights(airbnb_pd)
airbnb_pd[airbnb_pd['minimum_nights_flag']==1]['minimum_nights']

121      e32
212      n32
230     1L20
270      W32
452      d32
476      3B2
493      500
568      G32
581      3H2
590      F32
592      3m2
637      g32
652      s32
896      o32
904      3S2
931      3J2
1135     3I2
1140     A32
1141     A32
1176     3n2
1228     J32
1258     O32
1270     b32
1402     3k2
Name: minimum_nights, dtype: object

In [110]:
def clean_min_nights_pls(nights_string):
    nights_string = re.sub("[^0-9.]", "", nights_string)
    nights = int(nights_string)
    return nights

In [111]:
airbnb_pd.loc[airbnb_pd['minimum_nights_flag']==1, 'minimum_nights'] = airbnb_pd.loc[airbnb_pd['minimum_nights_flag']==1, 'minimum_nights'].apply(lambda x: clean_min_nights_pls(x))
airbnb_pd.loc[airbnb_pd['minimum_nights_flag']==1, 'minimum_nights']
# nikolaus change minimum_nights_flag 1 to 0
airbnb_pd.loc[airbnb_pd['minimum_nights_flag']==1, 'minimum_nights_flag'] = 0


# Minimum nights checking

This query should return zero rows once you implement the cleaning process

In [112]:
min_check_pd = airbnb_pd[airbnb_pd.minimum_nights_flag==0]
min_check_pd = min_check_pd[min_check_pd.minimum_nights.apply(lambda x:check_integer(x,0,365))==False]
min_check_pd[["id","minimum_nights"]]

Unnamed: 0,id,minimum_nights


In [113]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [114]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                    24
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      0
longitude_flag                     0
minimum_nights_flag                4
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [115]:
dcm1.change_df(workset_before,workset_after,"clean_minimum_nights_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(3277, 86500, 121, 10), (4100, 4100, 151, 23), (5734, 86568, 212, 10), (6220, 86581, 230, 10), (7300, 86619, 270, 10), (12214, 86748, 452, 10), (12862, 86770, 476, 10), (13321, 13321, 493, 10), (15346, 86826, 568, 10), (15697, 86833, 581, 10), (15940, 86837, 590, 10), (15994, 86840, 592, 10), (17209, 86870, 637, 10), (17384, 17384, 643, 23), (17614, 86884, 652, 10), (24202, 87047, 896, 10), (24418, 87053, 904, 10), (25147, 87065, 931, 10), (30128, 30128, 1115, 23), (30655, 87214, 1135, 10), (30790, 87218, 1140, 10), (30817, 87219, 1141, 10), (30992, 30992, 1147, 23), (31762, 87248, 1176, 10), (33166, 87298, 1228, 10), (33976, 87331, 1258, 10), (34300, 87340, 1270, 10), (37864, 87423, 1402, 10)], [32, 2, 32, 120, 32, 32, 32, 500, 32, 32, 32, 32, 32, 2, 32, 32, 32, 32, 2, 32, 32, 32, 2, 32, 32, 32, 32, 32])
DEBUG:root:('change_values idxlist:', ((3277, 86500, 121, 10), 32, (121, 10)))
DEBUG:root:('change_values idxlist:', ((4100, 4100, 151, 23), 2, 

True

In [116]:
workset_before = workset_after.copy()

In [117]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==6]

Unnamed: 0,0,1,2,3,4
89638,89638,89,6,Entire home/apt,86428
89639,89639,170,6,Entire home/apt,86430
89640,89640,224,6,Entire home/apt,86431
89641,89641,359,6,Entire home/apt,86437
89642,89642,431,6,2,431
...,...,...,...,...,...
89831,89831,38510,6,Entire home/apt,87448
89832,89832,38780,6,Entire home/apt,87451
89833,89833,39104,6,Entire home/apt,87458
89834,89834,39455,6,Entire home/apt,87466


In [118]:
def clean_number_of_reviews(df):
    #raise Exception("not yet have implementation")
    # do something here
    reviews_mask = df['number_of_reviews'].apply(lambda x: not check_integer(x,0,999999))
    df.loc[reviews_mask, 'number_of_reviews_flag'] = 1
    df.loc[df['number_of_reviews'].isna(), 'number_of_reviews_flag']=2
    return df

In [119]:
airbnb_pd = clean_number_of_reviews(airbnb_pd)

In [120]:
airbnb_pd[airbnb_pd['number_of_reviews_flag']==1]['number_of_reviews']

8        7i4
21       1H3
27      3B94
34      B152
55       k22
58      2x88
132     J199
147      J43
159     31c3
173      A41
174     2o44
175     2o44
211     14L1
227     O111
290     K432
294     d123
316     11O1
338     17e5
431      u13
432      u13
446     s102
460      3D3
481      p68
518      5 6
545      x27
575      1l3
717      1l9
738      H31
740      5M2
811      2x2
853      n16
880      1p8
909      3M6
964      3W5
1071     4q3
1125     2R0
1149     2M3
1183     1O1
1187     b17
1239     t23
1248     O13
Name: number_of_reviews, dtype: object

In [121]:
airbnb_pd.loc[airbnb_pd['number_of_reviews_flag']==1, 'number_of_reviews'] = airbnb_pd.loc[airbnb_pd['number_of_reviews_flag']==1, 'number_of_reviews'].apply(lambda x: clean_min_nights_pls(x))
airbnb_pd.loc[airbnb_pd['number_of_reviews_flag']==1, 'number_of_reviews']
# nikolaus change number_of_reviews_flag to 0
airbnb_pd.loc[airbnb_pd['number_of_reviews_flag']==1, 'number_of_reviews_flag'] = 0

# Clean number of reviews checking

This query should return zero rows once you implement the cleaning process

In [122]:
min_check_pd = airbnb_pd[airbnb_pd.number_of_reviews_flag==0]
min_check_pd = min_check_pd[min_check_pd.number_of_reviews.apply(lambda x:check_integer(x,1,999999))==False]
min_check_pd[["id","number_of_reviews"]]

Unnamed: 0,id,number_of_reviews


In [123]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [124]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                 41
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      0
longitude_flag                     0
minimum_nights_flag                0
number_of_reviews_flag             6
last_review_flag                   0
room_type_flag                     0
d

In [125]:
dcm1.change_df(workset_before,workset_after,"clean_number_of_reviews_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(227, 86432, 8, 11), (578, 86445, 21, 11), (740, 86447, 27, 11), (929, 86450, 34, 11), (1496, 86466, 55, 11), (1577, 86468, 58, 11), (3575, 86507, 132, 11), (3980, 86517, 147, 11), (4304, 86524, 159, 11), (4398, 4398, 162, 24), (4682, 86535, 173, 11), (4709, 86538, 174, 11), (4736, 86540, 175, 11), (5586, 5586, 206, 24), (5708, 86564, 211, 11), (6140, 86577, 227, 11), (6423, 6423, 237, 24), (7841, 86629, 290, 11), (7949, 86636, 294, 11), (8543, 86650, 316, 11), (9137, 86665, 338, 11), (11648, 86726, 431, 11), (11675, 86728, 432, 11), (11877, 11877, 439, 24), (12053, 86740, 446, 11), (12431, 86760, 460, 11), (12998, 86775, 481, 11), (13997, 86795, 518, 11), (14726, 86813, 545, 11), (15536, 86831, 575, 11), (19370, 86917, 717, 11), (19937, 86939, 738, 11), (19991, 86943, 740, 11), (21908, 86981, 811, 11), (23042, 87012, 853, 11), (23771, 87033, 880, 11), (24554, 87055, 909, 11), (25890, 25890, 958, 24), (26039, 87087, 964, 11), (28928, 87166, 1071, 

True

In [126]:
workset_before = workset_after.copy()

In [127]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==7]

Unnamed: 0,0,1,2,3,4
89836,89836,3277,7,32,86500
89837,89837,4100,7,2,4100
89838,89838,5734,7,32,86568
89839,89839,6220,7,120,86581
89840,89840,7300,7,32,86619
89841,89841,12214,7,32,86748
89842,89842,12862,7,32,86770
89843,89843,13321,7,500,13321
89844,89844,15346,7,32,86826
89845,89845,15697,7,32,86833


# cleanup last_review

The "last_review" column should be in the format of ISO-date (yyyy-mm-dd). We have provided a "check_date" function to verify the date format.

If a value is outside the date format or is null and you are unsure how to handle it, you can flag the row for deletion by setting the "last_review_flag" to 1 or 2.


In [128]:
from datetime import datetime
def check_date(x,fmt="%Y-%m-%d"):
    try:
        datetime.strptime(x,fmt)
        return True
    except:
        return False

In [129]:
def clean_last_reviews(df):
    #raise Exception("not yet have implementation")
    # do something here
    last_mask = df['last_review'].apply(lambda x: not check_date(x))
    df.loc[last_mask, 'last_review_flag'] = 1
    df.loc[df['last_review'].isna(), 'last_review_flag'] = 2
    return df

In [130]:
def clean_date(date_str):
    # Convert the date string to a datetime object
    dt = datetime.strptime(date_str, '%B %d, %Y')

    # Convert the datetime object to an ISO-format string
    iso_date_str = dt.date().isoformat()
    print(iso_date_str)
    return iso_date_str

In [131]:
airbnb_pd = clean_last_reviews(airbnb_pd)
airbnb_pd.loc[airbnb_pd['last_review_flag']==1,'last_review']=airbnb_pd.loc[airbnb_pd['last_review_flag']==1,'last_review'].apply(clean_date)
airbnb_pd[airbnb_pd['last_review_flag']==1]['last_review']
# nikolaus change last_review_flag back to 0
airbnb_pd.loc[airbnb_pd['last_review_flag']==1, 'last_review_flag'] = "0"

2022-11-28
2022-11-27
2022-08-17
2022-05-01
2022-10-24
2022-12-11
2022-10-02
2022-12-09
2019-10-28
2022-06-08
2022-10-27
2022-12-18
2022-11-27
2022-08-29
2019-12-28
2022-12-06
2022-12-11
2022-11-27
2022-11-27
2022-11-27
2022-11-21
2021-11-15
2022-06-12
2022-11-27
2022-12-05
2022-11-25
2022-10-17
2022-11-25
2022-11-25
2022-06-26
2022-07-30
2022-11-27
2022-10-10
2022-08-01
2022-12-05
2022-12-05
2022-12-08
2022-05-25
2022-11-28
2022-12-11
2022-09-27
2022-11-27
2022-12-09
2022-11-27
2022-11-16
2022-08-22
2022-12-04
2022-07-31
2022-12-16
2022-12-16
2022-11-10
2022-12-04
2022-12-12
2022-12-12
2022-12-18
2022-11-12
2022-10-25
2022-12-10
2022-12-16
2022-12-18
2022-12-17
2022-12-17
2022-11-25
2022-12-01


# Last Review checking

This query should return zero rows once you implement the cleaning process

In [132]:
last_review_check_pd = airbnb_pd[airbnb_pd.last_review_flag==0]
last_review_check_pd = last_review_check_pd[last_review_check_pd.last_review.apply(lambda x:check_date(x))==False]
last_review_check_pd[["id","last_review"]]

Unnamed: 0,id,last_review


In [133]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [134]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group                 0
neighbourhood                       0
latitude                            0
longitude                           0
room_type                           0
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                        64
reviews_per_month                   0
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                             0
id_flag                             0
host_id_flag                        0
neighbourhood_flag                  0
latitude_flag                       0
longitude_flag                      0
minimum_nights_flag                 0
number_of_reviews_flag              0
last_review_flag                  350
room_type_fl

In [135]:
dcm1.change_df(workset_before,workset_after,"clean_last_review_c1","c1")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(66, 86427, 2, 12), (79, 79, 2, 25), (430, 430, 15, 25), (741, 86448, 27, 12), (754, 754, 27, 25), (957, 86451, 35, 12), (970, 970, 35, 25), (1942, 1942, 71, 25), (2509, 2509, 92, 25), (3360, 86501, 124, 12), (3373, 3373, 124, 25), (4021, 4021, 148, 25), (4075, 4075, 150, 25), (4210, 4210, 155, 25), (4993, 4993, 184, 25), (5074, 5074, 187, 25), (5169, 86552, 191, 12), (5182, 5182, 191, 25), (5304, 86553, 196, 12), (5317, 5317, 196, 25), (5709, 86565, 211, 12), (5722, 5722, 211, 25), (5763, 86569, 213, 12), (5776, 5776, 213, 25), (6087, 86576, 225, 12), (6100, 6100, 225, 25), (6235, 6235, 230, 25), (6370, 6370, 235, 25), (6532, 6532, 241, 25), (6667, 6667, 246, 25), (6775, 6775, 250, 25), (6937, 6937, 256, 25), (7086, 86615, 262, 12), (7099, 7099, 262, 25), (7167, 86618, 265, 12), (7180, 7180, 265, 25), (7207, 7207, 266, 25), (7842, 86630, 290, 12), (7855, 7855, 290, 25), (7896, 86633, 292, 12), (7909, 7909, 292, 25), (8085, 86638, 299, 12), (8098,

DEBUG:root:('change_values idxlist:', ((66, 86427, 2, 12), '2022-11-28', (2, 12)))
DEBUG:root:('change_values idxlist:', ((79, 79, 2, 25), '0', (2, 25)))
DEBUG:root:('change_values idxlist:', ((430, 430, 15, 25), 2, (15, 25)))
DEBUG:root:('change_values idxlist:', ((741, 86448, 27, 12), '2022-11-27', (27, 12)))
DEBUG:root:('change_values idxlist:', ((754, 754, 27, 25), '0', (27, 25)))
DEBUG:root:('change_values idxlist:', ((957, 86451, 35, 12), '2022-08-17', (35, 12)))
DEBUG:root:('change_values idxlist:', ((970, 970, 35, 25), '0', (35, 25)))
DEBUG:root:('change_values idxlist:', ((1942, 1942, 71, 25), 2, (71, 25)))
DEBUG:root:('change_values idxlist:', ((2509, 2509, 92, 25), 2, (92, 25)))
DEBUG:root:('change_values idxlist:', ((3360, 86501, 124, 12), '2022-05-01', (124, 12)))
DEBUG:root:('change_values idxlist:', ((3373, 3373, 124, 25), '0', (124, 25)))
DEBUG:root:('change_values idxlist:', ((4021, 4021, 148, 25), 2, (148, 25)))
DEBUG:root:('change_values idxlist:', ((4075, 4075, 150,

DEBUG:root:('change_values idxlist:', ((16090, 16090, 595, 25), 2, (595, 25)))
DEBUG:root:('change_values idxlist:', ((16563, 86853, 613, 12), '2022-06-26', (613, 12)))
DEBUG:root:('change_values idxlist:', ((16576, 16576, 613, 25), '0', (613, 25)))
DEBUG:root:('change_values idxlist:', ((16657, 16657, 616, 25), 2, (616, 25)))
DEBUG:root:('change_values idxlist:', ((16684, 16684, 617, 25), 2, (617, 25)))
DEBUG:root:('change_values idxlist:', ((16765, 16765, 620, 25), 2, (620, 25)))
DEBUG:root:('change_values idxlist:', ((16806, 86856, 622, 12), '2022-07-30', (622, 12)))
DEBUG:root:('change_values idxlist:', ((16819, 16819, 622, 25), '0', (622, 25)))
DEBUG:root:('change_values idxlist:', ((16941, 86861, 627, 12), '2022-11-27', (627, 12)))
DEBUG:root:('change_values idxlist:', ((16954, 16954, 627, 25), '0', (627, 25)))
DEBUG:root:('change_values idxlist:', ((17022, 86864, 630, 12), '2022-10-10', (630, 12)))
DEBUG:root:('change_values idxlist:', ((17035, 17035, 630, 25), '0', (630, 25)))


DEBUG:root:('change_values idxlist:', ((27646, 27646, 1023, 25), 2, (1023, 25)))
DEBUG:root:('change_values idxlist:', ((27700, 27700, 1025, 25), 2, (1025, 25)))
DEBUG:root:('change_values idxlist:', ((27862, 27862, 1031, 25), 2, (1031, 25)))
DEBUG:root:('change_values idxlist:', ((27889, 27889, 1032, 25), 2, (1032, 25)))
DEBUG:root:('change_values idxlist:', ((27997, 27997, 1036, 25), 2, (1036, 25)))
DEBUG:root:('change_values idxlist:', ((28024, 28024, 1037, 25), 2, (1037, 25)))
DEBUG:root:('change_values idxlist:', ((28078, 28078, 1039, 25), 2, (1039, 25)))
DEBUG:root:('change_values idxlist:', ((28105, 28105, 1040, 25), 2, (1040, 25)))
DEBUG:root:('change_values idxlist:', ((28173, 87151, 1043, 12), '2022-08-22', (1043, 12)))
DEBUG:root:('change_values idxlist:', ((28186, 28186, 1043, 25), '0', (1043, 25)))
DEBUG:root:('change_values idxlist:', ((28240, 28240, 1045, 25), 2, (1045, 25)))
DEBUG:root:('change_values idxlist:', ((28375, 28375, 1050, 25), 2, (1050, 25)))
DEBUG:root:('ch

DEBUG:root:('change_values idxlist:', ((35490, 87367, 1314, 12), '2022-12-10', (1314, 12)))
DEBUG:root:('change_values idxlist:', ((35503, 35503, 1314, 25), '0', (1314, 25)))
DEBUG:root:('change_values idxlist:', ((35530, 35530, 1315, 25), 2, (1315, 25)))
DEBUG:root:('change_values idxlist:', ((35544, 87370, 1316, 12), '2022-12-16', (1316, 12)))
DEBUG:root:('change_values idxlist:', ((35557, 35557, 1316, 25), '0', (1316, 25)))
DEBUG:root:('change_values idxlist:', ((35638, 35638, 1319, 25), 2, (1319, 25)))
DEBUG:root:('change_values idxlist:', ((35692, 35692, 1321, 25), 2, (1321, 25)))
DEBUG:root:('change_values idxlist:', ((35746, 35746, 1323, 25), 2, (1323, 25)))
DEBUG:root:('change_values idxlist:', ((35773, 35773, 1324, 25), 2, (1324, 25)))
DEBUG:root:('change_values idxlist:', ((35800, 35800, 1325, 25), 2, (1325, 25)))
DEBUG:root:('change_values idxlist:', ((35827, 35827, 1326, 25), 2, (1326, 25)))
DEBUG:root:('change_values idxlist:', ((35962, 35962, 1331, 25), 2, (1331, 25)))
DE

DEBUG:root:('change_values idxlist:', ((39850, 39850, 1475, 25), 2, (1475, 25)))
DEBUG:root:('change_values idxlist:', ((39877, 39877, 1476, 25), 2, (1476, 25)))
DEBUG:root:('change_values idxlist:', ((39904, 39904, 1477, 25), 2, (1477, 25)))
DEBUG:root:('change_values idxlist:', ((39931, 39931, 1478, 25), 2, (1478, 25)))
DEBUG:root:('change_values idxlist:', ((39958, 39958, 1479, 25), 2, (1479, 25)))
DEBUG:root:('change_values idxlist:', ((39985, 39985, 1480, 25), 2, (1480, 25)))
DEBUG:root:('change_values idxlist:', ((40012, 40012, 1481, 25), 2, (1481, 25)))
DEBUG:root:('change_values idxlist:', ((40039, 40039, 1482, 25), 2, (1482, 25)))
DEBUG:root:('change_values idxlist:', ((40066, 40066, 1483, 25), 2, (1483, 25)))
DEBUG:root:('change_values idxlist:', ((40093, 40093, 1484, 25), 2, (1484, 25)))
DEBUG:root:('change_values idxlist:', ((40120, 40120, 1485, 25), 2, (1485, 25)))


True

In [136]:
workset_before = workset_after.copy()

In [137]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==8]

Unnamed: 0,0,1,2,3,4
89864,89864,227,8,74,86432
89865,89865,578,8,13,86445
89866,89866,740,8,394,86447
89867,89867,929,8,152,86450
89868,89868,1496,8,22,86466
89869,89869,1577,8,288,86468
89870,89870,3575,8,199,86507
89871,89871,3980,8,43,86517
89872,89872,4304,8,313,86524
89873,89873,4398,8,2,4398


In [138]:
pd.DataFrame(dcm1.column_position)

Unnamed: 0,0,1,2,3,4,5
0,0,0,-1,id,-1,-1
1,1,1,-1,name,0,-1
2,2,2,-1,host_id,1,-1
3,3,3,-1,host_name,2,-1
4,4,4,-1,neighbourhood_group,3,-1
5,5,5,-1,neighbourhood,4,-1
6,6,6,-1,latitude,5,-1
7,7,7,-1,longitude,6,-1
8,8,8,-1,room_type,7,-1
9,9,9,-1,price,8,-1


In [139]:
pd.DataFrame(dcm1.state_detail)

Unnamed: 0,0,1,2,3,4
0,-1,{'op': 'initial'},0,-2,
1,0,perturbed_dataset,0,-1,0.0
2,1,clean_duplicate_id_c1,1,0,1.0
3,2,clean_inconsistent_host_id_c1,2,1,1.0
4,3,clean_neighbourhood_c1,3,2,1.0
5,4,clean_latitude_c1,4,3,1.0
6,5,clean_longitude_c1,5,4,1.0
7,6,clean_room_type_c1,6,5,1.0
8,7,clean_minimum_nights_c1,7,6,1.0
9,8,clean_number_of_reviews_c1,8,7,1.0


In [140]:
pd.DataFrame(dcm1.state)

Unnamed: 0,0,1
0,-1,-2
1,0,-1
2,1,0
3,2,1
4,3,2
5,4,3
6,5,4
7,6,5
8,7,6
9,8,7


In [141]:
check_pd = pd.DataFrame(dcm1.cell_values).merge(pd.DataFrame(dcm1.cell),left_on=1,right_on=0). \
    merge(pd.DataFrame(dcm1.column_position),left_on="1_y",right_on=1). \
    merge(pd.DataFrame(dcm1.state_detail),left_on="2_x",right_on=0)


  check_pd = pd.DataFrame(dcm1.cell_values).merge(pd.DataFrame(dcm1.cell),left_on=1,right_on=0). \
  check_pd = pd.DataFrame(dcm1.cell_values).merge(pd.DataFrame(dcm1.cell),left_on=1,right_on=0). \


In [142]:
check_pd

Unnamed: 0,1_x,0_x,1_x.1,2_x,3_x,4_x,0_y,1_y,2_y,0_x.1,1_y.1,2_x.1,3_y,4_y,5,0_y.1,1,2_y.1,3,4
0,0,0,0,-1,25879,-1,0,0,0,0,0,-1,id,-1,-1,-1,{'op': 'initial'},0,-2,
1,27,27,27,-1,37738,-1,27,0,1,0,0,-1,id,-1,-1,-1,{'op': 'initial'},0,-2,
2,54,54,54,-1,189821,-1,54,0,2,0,0,-1,id,-1,-1,-1,{'op': 'initial'},0,-2,
3,81,81,81,-1,207218,-1,81,0,3,0,0,-1,id,-1,-1,-1,{'op': 'initial'},0,-2,
4,108,108,108,-1,220333,-1,108,0,4,0,0,-1,id,-1,-1,-1,{'op': 'initial'},0,-2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90320,39871,89137,39871,2,1,39871,39871,19,1476,19,19,-1,host_id_flag,18,-1,2,clean_inconsistent_host_id_c1,2,1,1.0
90321,39979,89138,39979,2,1,39979,39979,19,1480,19,19,-1,host_id_flag,18,-1,2,clean_inconsistent_host_id_c1,2,1,1.0
90322,40006,89139,40006,2,1,40006,40006,19,1481,19,19,-1,host_id_flag,18,-1,2,clean_inconsistent_host_id_c1,2,1,1.0
90323,40033,89140,40033,2,1,40033,40033,19,1482,19,19,-1,host_id_flag,18,-1,2,clean_inconsistent_host_id_c1,2,1,1.0


In [143]:
dcm1.col_id

27

In [144]:
# write datalog facts
with open("dcm_1.pl","w") as file:
    for x in set(dcm1.column):
        file.write('column("{}").\n'.format('","'.join([str(y) for y in x])))
    for x in set(dcm1.row):
        file.write('row("{}").\n'.format('","'.join([str(y) for y in x])))   
    for x in dcm1.cell:
        file.write('cell("{}").\n'.format('","'.join([str(y) for y in x])))
    for x in dcm1.cell_values:
        file.write('cell_values("{}").\n'.format('","'.join([str(y) for y in x])))
    for x in dcm1.col_dependency:
        file.write('col_dependency("{}").\n'.format('","'.join([str(y) for y in x])))
    for x in dcm1.value_derived_from:
        file.write('value_derived_from("{}").\n'.format('","'.join([str(y) for y in x])))
    for x in dcm1.column_position:
        file.write('column_position("{}").\n'.format('","'.join([str(y) for y in x])))

In [145]:
#save dcmx in pickle
import pickle
with open("dcm_collab.pickle","wb") as file:
    pickle.dump(dcm1,file)