In [1]:
import pandas as pd
import numpy as np
import abydos
#from abydos import distance

In [2]:
def dep_input_output_col(prev_df,now_df,desc):
    """Infer the input and output columns of an operation from its description.

    Column names are looked up as whole words inside the free-text
    description ``desc``. Names are tried longest-first and every mention
    that is found is removed from the working text before the next name is
    tried, so one mention is never attributed to two columns.

    Parameters
    ----------
    prev_df : pandas.DataFrame
        Frame before the operation; only ``.columns`` is read.
    now_df : pandas.DataFrame
        Frame after the operation; only ``.columns`` is read.
    desc : str or None
        Description of the operation. ``None`` is treated as an empty string.

    Returns
    -------
    tuple(list, list)
        ``in_cols``  - names of columns already present in ``prev_df`` that
        are mentioned in ``desc`` (one entry per mention).
        ``out_cols`` - every column that exists in ``now_df`` but not in
        ``prev_df``, de-duplicated; the order is unspecified (built from a set).
    """
    import re

    new_col = set(now_df.columns) - set(prev_df.columns)
    old_col = set(prev_df.columns)

    # Use the argument that was passed in. The previous version overwrote it
    # with a notebook-global ``ops`` variable, which fails on a fresh kernel.
    text = "" if desc is None else desc

    def _consume(names, remaining):
        """Return (mentions found, remaining text with those mentions removed)."""
        found = []
        for name in sorted(names, key=len, reverse=True):
            # re.escape: column names may contain regex metacharacters.
            pattern = r'\b{}\b'.format(re.escape(name))
            hits = re.findall(pattern, remaining)
            if hits:
                # Remove only the whole-word matches; a plain str.replace
                # would also eat the name wherever it occurs inside other words.
                remaining = re.sub(pattern, "", remaining)
                found.extend(hits)
        return found, remaining

    # New columns first, so their mentions cannot be mistaken for old columns.
    out_cols, text = _consume(new_col, text)
    in_cols, text = _consume(old_col, text)

    # Every new column counts as an output, whether or not it is mentioned.
    out_cols = list(set(out_cols) | new_col)

    return in_cols,out_cols

#dep_input_output_col(prev_df,now_df,ops["description"])

# General Instruction

- Cleanup dataset based on the information that is given:
You need to clean the dataset according to the information that is given to you. This means that there are problems with the dataset that need to be fixed, and you should use the information given to you to determine what those problems are and how to fix them.

- Each case has different data quality problems; there will be hints and additional information that can help you understand the problem:
Each row in the dataset may have different data quality problems. There will be hints and additional information provided to help you understand what the specific problem is with each row.

- You can take any approach to cleaning the data, but you should clean only the instructed column:
You have the freedom to use any approach to clean the data, but you should only clean the instructed column. This means that you should not modify any other columns in the dataset, or add or remove any rows.

- Do not create new columns or remove any columns. Also, do not create new rows or remove any rows:
You are not allowed to create new columns or remove any columns from the dataset. You are also not allowed to add or remove any rows.

- Each column has a corresponding flag column named <column\_name>\_flag. Use this column to flag a row that you do not want to include in the downstream task: 0 = safe_flag, 1 = delete_flag, 2 = null_flag (the row is still included, but with null treatment). You may also add a new category, provided you justify and explain it. The three predefined categories are:
safe_flag (0): this row is safe to use in downstream tasks
delete_flag (1): this row should be deleted and not used in downstream tasks
null_flag (2): this row can be included in downstream tasks but with null treatment.
You can also add a new category, but you need to provide a justification and an explanation for the new category. Note that the completeness of the dataset also matters, so try not to flag too many rows, and do your best to clean the values instead.

- For each data cleaning task, we have provided a function that represents the goal of the cleaning. For example, clean_duplicate_id(df) is the function for removing duplicate ID values. These functions take a DataFrame as input and return the cleaned version of the DataFrame.

    In each chunk of data cleaning task, you will see the following three parts:

    1. The clean_<name> function that performs the specific cleaning task.
    2. The execution of the cleaning function on the DataFrame.
    3. A checking part to help you evaluate the effectiveness of the cleaning.
    
  While you can create new cells and add additional code, the cleaning must be performed through the provided cleaning functions. You can adjust the order of the cleaning steps, but please try to move the whole chunks of code to avoid any errors.

The cleaning task will be considered complete if this notebook can be run sequentially, from top to bottom, by executing "Restart & Run All".




# Purpose
The purpose of this dataset is to conduct exploratory analysis of the listings and create a prediction model for listing price using some columns from the dataset. This means that the dataset is intended to be used to explore the characteristics and features of the listings, and to build a model that can predict the price of a listing based on certain variables in the dataset. The goal is to gain insights into the factors that influence the price of a listing and to develop a model that can accurately predict listing prices based on those factors.

# Columns and Dataset Description
- id: a unique identifier for each listing.
- name: the name or title of the listing, as provided by the host.
- host_id: a unique identifier for each host.
- host_name: the name of the host who listed the property.
- neighbourhood_group: the larger geographic area in which the listing is located (e.g. a borough or group of neighborhoods).
- neighbourhood: the specific neighborhood in which the listing is located.
- latitude: the latitude coordinate of the listing.
- longitude: the longitude coordinate of the listing.
- room_type: the type of space that is being listed (e.g. an entire apartment, a private room, a shared room).
- price: the nightly price of the listing, in the currency specified in the dataset.
- minimum_nights: the minimum number of nights that a guest must book the listing for.
- number_of_reviews: the total number of reviews that the listing has received.
- last_review: the date of the most recent review of the listing.
- reviews_per_month: the average number of reviews per month that the listing has received.
- calculated_host_listings_count: the total number of listings that the host has on Airbnb.
- availability_365: the number of days per year that the listing is available for booking.
- number_of_reviews_ltm: the total number of reviews that the listing has received in the last 12 months.
- license: a license number for the listing, if applicable (this column may not be present in all versions of the dataset).

Besides the columns above, there are columns pre-defined for flagging the rows based on particular data cleaning context:
- id_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the id column (duplicate).
- host_id_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the host_id column.
- neighbourhood_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the neighbourhood column.
- latitude_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the latitude column.
- longitude_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the longitude column.
- minimum_nights_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the minimum_nights column.
- number_of_reviews_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the number_of_reviews column.
- last_review_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the last_review column.
- room_type_flag: a flag column indicating whether a given row should be included in downstream analysis or not based on data quality issues related to the room_type column.

In [3]:
# transform dcm
# Setup for the TransformDCM cell: imports used by the class below plus
# logging configuration. (numpy is used by TransformDCM.init_df as ``np`` and
# is imported in the first cell of the notebook.)
import logging 
import pandas as pd

# Request DEBUG level on the root logger; TransformDCM logs its intermediate
# state through logging.debug(...). Switch to the INFO line below for quieter
# output.
logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.INFO)

class TransformDCM():
    def __init__(self,trace):
        self.trace = trace
        self.source = []
        self.dataset = []
        self.state = []
        self.array = []
        self.column = []
        self.row = []
        self.cell = []
        self.cell_values = []
        self.column_position = []
        self.row_position = []
        self.user = {None: -1}
        
        self.value_derived_from = []        
        self.col_derived_from = []
        self.state_derived_from = []
        self.col_dependency = []
        self.state_detail = []
        
        self.pd_index = None
        
        self.source_id = 0
        self.dataset_id = 0
        self.array_id = 0
        self.col_id = 0
        self.row_id = 0
        self.cell_id = 0
        self.value_id = 0
        self.state_id = -1
        self.col_pos_id = 0
        self.row_pos_id = 0
        self.user_id = 0
        self.execution_id = 0
        self.prev_state_id = -2        
        
        self.col_names_coll = set()
        
        
        self.curr_df = None
        self.curr_col = None
        self.curr_row = None
        self.curr_index = None
        
        self.curr_row_pos = {}
        self.curr_col_pos = {}
        self.curr_col_schema = []
        self.curr_row_list = []
        
        self.curr_state = 0            
        
    
    def render_curr_df(self):
        """Placeholder hook: currently does nothing and returns ``None``."""
        return None
    
    def render_col(self):
        """Placeholder hook: currently does nothing and returns ``None``."""
        return None
    
    def render_row(self):
        """Placeholder hook: currently does nothing and returns ``None``."""
        return None
        
    
    def init_df(self,df):
        """Register every cell, column and row of ``df`` in the record tables.

        For each cell of ``df`` one entry is appended to ``self.cell``,
        ``self.cell_values`` and ``self.value_derived_from``, and the matching
        slot of ``self.pd_index`` is set to ``(cell_id, value_id, i, jj)``.
        While walking the first row, one entry per column is appended to
        ``self.column``, ``self.column_position`` and ``self.curr_col_schema``.
        After each row, one entry is appended to ``self.row``,
        ``self.row_position`` and ``self.curr_row_list``.

        All entries are stamped with the current ``self.state_id``; the id
        counters (cell, value, column, column position, row, row position)
        are advanced as entries are created.

        Parameters
        ----------
        df
            Frame to register. NOTE(review): ``self.pd_index`` is addressed
            with the enumeration counters ``i``/``jj``, which lines up with
            its default integer labels - confirm callers never pass a frame
            whose shape differs from what was used to build ``pd_index``.
        """
        # Object-typed frame of the same shape as df; each slot is overwritten below.
        self.pd_index = pd.DataFrame(np.empty(df.shape),dtype=object)    
        self.col_names = df.columns
        
        for i,x in enumerate(df.to_records()):
            # jj counts data columns only (field 0 of the record is skipped below).
            jj = 0
            #print(i)
            for j,y in enumerate(x):
                #print(j)
                # Field 0 is skipped: to_records() places the index first by default.
                if j==0:
                    continue
                
                # Cell record: (cell_id, column counter, row_id).
                self.cell.append((self.cell_id,jj,self.row_id))
                #print(y)
                #self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0],-1))
                # Value record: (value_id, cell_id, state_id, value, -1).
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y,-1))
                #print(self.pd_index.loc[i,jj])
                self.pd_index.loc[i,jj] = (self.cell_id,self.value_id,i,jj) 
                
                # Initial values have no source value: third field is -1.
                self.value_derived_from.append((self.cell_id,self.state_id,-1))

                self.cell_id+=1
                self.value_id+=1
                # Columns are registered once, while walking the first row.
                if i == 0:
                    if jj == 0:
                        # The first column has no predecessor.
                        prev_j = -1
                    
                    
                    self.column.append((self.col_id,self.array_id))                    
                    # Column position record: (col_pos_id, col_id, state_id, name, previous column, -1).
                    self.column_position.append((self.col_pos_id,self.col_id,self.state_id,self.col_names[jj],prev_j,-1))
                    self.col_names_coll.add(self.col_names[jj])
                    # Schema entry: (name, column counter, col_id, previous column).
                    self.curr_col_schema.append((self.col_names[jj],jj,self.col_id,prev_j))
                    self.curr_col_pos[self.col_id] = (self.col_pos_id,prev_j)
                    prev_j = jj
                    self.col_pos_id+=1                    
                    self.col_id+=1    
                jj+=1
            if i == 0:
                # The first row has no predecessor.
                prev_i = -1
                
            self.row.append((self.row_id,self.array_id))            
            # Row position record: (row_pos_id, row_id, state_id, previous row_id, -1).
            self.row_position.append((self.row_pos_id,self.row_id,self.state_id,prev_i,-1))
            self.curr_row_pos[self.row_id] = (self.row_pos_id,prev_i)
            self.curr_row_list.append(self.row_pos_id)
            prev_i = self.row_id
            self.row_id+=1       
            self.row_pos_id+=1            
        
        """            
        col_id = np.where(df.columns==col.name)[0][0]
        columns.append((col_id,self.array_id))
        #temp_col = []
        for i,x in enumerate(col):
            if not row_processed:
                rows.append((i,array_id))
            #temp_col.append((cell_id,col_id,i))
            cells.append((cell_id,col_id,i))
            cell_values.append((value_id,cell_id,state_id,x,-1))
            pd_index.loc[i,col_id] = (cell_id,value_id,col_id,i)        
            cell_id+=1
            value_id+=1
            #print(i,col_id)
        row_processed = True        
        """
    
    def init_dataset(self,tt):
        logging.debug("init dataset")
        # get filename from trace        
        df = tt[5]
        code = tt[6]
        self.source.append((self.source_id,code,"dataframe"))
        self.dataset.append((self.dataset_id,self.source_id))
        self.array.append((self.array_id,self.dataset_id))
        # state creation
        
        prev_state_id = self.state_id
        self.state_id+=1        
        self.state.append((self.state_id,prev_state_id))            
        
        # generate column, row, cell
        self.init_df(df)
        self.state_id+=1        

        
        #self.state_id+=1
        #self.array_id+=1
        #self.dataset_id+=1
        #self.source_id+=1
        
    def init_dataset_df(self,df,state_ss={'op':"initial"},fname=None):
        logging.debug("init dataset")
        self.source.append((self.source_id,fname,"dataframe"))
        self.dataset.append((self.dataset_id,self.source_id))
        self.array.append((self.array_id,self.dataset_id))
        # state creation
                
        # generate column, row, cell
        self.init_df(df)       


        self.state.append((self.state_id,self.prev_state_id))
        #self.state_detail.append((prev_state_id,state_ss))
        self.state_detail.append((self.state_id,state_ss,self.execution_id,self.prev_state_id,None))      
        
        self.prev_state_id = self.state_id
        self.state_id+=1

    
    def change_column_schema(self,prev_df,now_df):
        #old_col = list(self.curr_col)  
        old_col = list(prev_df.columns)
        now_col = list(now_df.columns)
        #logging.debug(now_df)
        #logging.debug(("old new col",old_col,self.curr_col_schema,now_col))
        new_col = set(now_col)-set(old_col)
        ocol = set(old_col) - set(now_col)
        
        logging.debug(("change_col_schema old_col",old_col,self.curr_col_schema,now_col))
        logging.debug(("change_col_schema new_col",new_col,ocol))
        
        old_schema = [x[0] for x in self.curr_col_schema]
        
        temp_new_col = []
        
        temp_prev = None
        
        logging.debug(("change_col_schema old_schema:",old_schema))
                
        for n_idx,x in enumerate(now_col):
            logging.debug("change_col_schema now_col:")
            try:
                idx_schema = old_schema.index(x)
            except:
                # check the potential column
                for y in ocol:                    
                    test = self.curr_df.loc[:,y].fillna(0) == now_df.loc[:,x].fillna(0)
                    logging.debug(("test",test.sum(),self.curr_df.shape[0],self.curr_df,now_df))
                    if test.sum() == self.curr_df.shape[0]:
                        idx = old_col.index(y)
                        prev_idx = (-1,None)
                        logging.debug(("idx_test",idx))
                        if idx > 0:
                            prev_idx = old_col[idx-1]
                            idx_n = now_col.index(x)
                            now_idx = self.curr_col_schema[idx_n]
                            prev_col_schema = (None,None,-1,None)
                            if idx_n > 0:
                                prev_col_schema = self.curr_col_schema[idx_n-1]
                            logging.debug(("change_col_schema now_idx:",now_idx,prev_idx))
                            self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,prev_col_schema[2],now_idx[1]))
                            logging.debug(("change_col_schema adding_col_pos1:",(self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2])))
                            temp_new_col.append((x,self.col_pos_id,now_idx[2],now_idx[3]))
                            #temp_new_col.append((x,self.col_pos_id,now_idx[2]))
                            #temp_new_col.append((x,self.col_pos_id,-1))
                            self.col_pos_id+=1  
                            
                            logging.debug(("change_col_schema now_idx",now_idx))
                            self.col_dependency.append((self.state_id,now_idx[-2] if len(now_idx)==4 else now_idx[-1],now_idx[-2] if len(now_idx)==4 else now_idx[-1]))                                                                      
                            break
                continue
                                        
            old_schema_idx = self.curr_col_schema[idx_schema]
            idx = idx_schema
            #idx = old_col.index(x)
            #prev_idx = (-1,None)
            prev_idx = -1
            prev_old_schema = (None,None,None,None)
            if idx > 0:
                prev_idx = idx - 1
                prev_old_schema = self.curr_col_schema[prev_idx]            

            prev_nidx = -1
            prev_new_schema = (None,None,None,None)
            if n_idx > 0:     
                prev_nidx = n_idx - 1
                prev_new_schema = temp_new_col[-1]
                
            #logging.debug(("change_col_schema old",n_idx,x,idx,prev_idx,self.curr_col_schema[idx_schema]))
            #logging.debug(("change_col_schema cur_col_schema:",self.curr_col_schema[n_idx][0],x))
            logging.debug(("change_col_schema prev next:",prev_old_schema,prev_new_schema))
            
            #if self.curr_col_schema[n_idx][0] != x:
            if prev_old_schema[2] != prev_new_schema[2]:
                #if prev_idx[1] != prev_nidx[1]: 
                #if prev_idx[0] != prev_nidx[1]:         
                #logging.debug(("tempnewcol:",temp_new_col[n_idx-1], now_idx))
                #if temp_new_col[n_idx-1][2] != now_idx[2]:
                """
                if temp_new_col[n_idx-1][2] != now_idx[2]:
                    idx_n = now_col.index(x)
                    self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2]))
                    logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                    #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                    temp_new_col.append((x,self.col_pos_id,now_idx[2],temp_prev))
                    self.col_pos_id+=1
                """
                self.column_position.append((self.col_pos_id,old_schema_idx[1],self.state_id,x,prev_new_schema[1],old_schema_idx[1]))
                #logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                temp_new_col.append((x,self.col_pos_id,old_schema_idx[1],prev_new_schema[1]))
                self.col_pos_id+=1
                #aaa
            else:
                 #temp_new_col.append((x,now_idx[1],now_idx[2],now_idx[3]))
                temp_new_col.append(self.curr_col_schema[idx_schema])
        
        
        logging.debug(("change_col_schema temp_new_col:",temp_new_col))
        
        self.curr_col_schema = temp_new_col
                
        return None
        
        for n_idx,x in enumerate(now_col):
            try:
                idx_schema = old_schema.index(x)
            except:
                # check the potential column
                for y in ocol:                    
                    test = self.curr_df.loc[:,y].fillna(0) == now_df.loc[:,x].fillna(0)
                    logging.debug(("test",test.sum(),self.curr_df.shape[0],self.curr_df,now_df))
                    if test.sum() == self.curr_df.shape[0]:
                        idx = old_col.index(y)
                        prev_idx = (-1,None)
                        logging.debug(("idx_test",idx))
                        if idx > 0:
                            prev_idx = old_col[idx-1]
                            idx_n = now_col.index(x)
                            now_idx = self.curr_col_schema[idx_n]  
                            logging.debug(("now_idx:",now_idx,prev_idx))
                            self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2]))
                            logging.debug(("adding_col_pos1:",(self.col_pos_id,now_idx[-2],self.state_id,x,n_idx-1,now_idx[2])))
                            temp_new_col.append((x,self.col_pos_id,now_idx[2],now_idx[3]))
                            #temp_new_col.append((x,self.col_pos_id,now_idx[2]))
                            #temp_new_col.append((x,self.col_pos_id,-1))
                            self.col_pos_id+=1  
                            
                            logging.debug(("now_idx",now_idx))
                            self.col_dependency.append((self.state_id,now_idx[-2] if len(now_idx)==4 else now_idx[-1],now_idx[-2] if len(now_idx)==4 else now_idx[-1]))                              
                continue
            
            old_schema_idx = self.curr_col_schema[idx_schema]
            
            idx = old_col.index(x)
            prev_idx = (-1,None)
            next_idx = None
            if idx > 0:
                prev_idx = old_col[idx-1]
            #if idx < len(old_col)-1:
            #    next_idx = old_col[idx+1]

            logging.debug(("change_schema",n_idx,x,idx,prev_idx,self.curr_col_schema[idx_schema]))
                        
            now_idx = self.curr_col_schema[idx_schema]    
            
            prev_nidx = (-1,None)
            if n_idx > 0:                
                prev_nidx = (n_idx-1,now_col[n_idx-1])                        
                prev_idx = self.curr_col_schema[n_idx-1]
            
            #logging.debug((self.curr_col_pos[old_schema_idx[2]],now_idx))
            
            
            if temp_prev == None:
                temp_prev = -1
            else:
                temp_prev = now_idx[-1]
                
            #if self.curr_col_pos[old_schema_idx[2]][1] != now_idx[3]:
            #print(now_idx,temp_new_col[n_idx-1])
            logging.debug(("prev_idx,nidx:",prev_idx,prev_nidx,temp_new_col,n_idx))
            if prev_idx[1] != prev_nidx[1]:         
                #if prev_idx[0] != prev_nidx[1]:         
                logging.debug(("tempnewcol:",temp_new_col[n_idx-1], now_idx))
                #if temp_new_col[n_idx-1][2] != now_idx[2]:
                if temp_new_col[n_idx-1][2] != now_idx[2]:
                    idx_n = now_col.index(x)
                    self.column_position.append((self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2]))
                    logging.debug(("adding_col_pos2:",(self.col_pos_id,now_idx[-2],self.state_id,x,temp_new_col[n_idx-1][2],now_idx[2])))                            
                    #temp_new_col.append((x,self.col_pos_id,now_idx[1],now_idx[2],temp_prev))
                    temp_new_col.append((x,self.col_pos_id,now_idx[2],temp_prev))
                    self.col_pos_id+=1
            else:
                 temp_new_col.append((x,now_idx[1],now_idx[2],now_idx[3]))
            
                                
            """
            #if idx_n > 0:
            prev_idx_n = -1
            try:
                prev_idx_n = now_col[idx_n-1]
            except:
                pass
            logging.debug(("test column",prev_idx,prev_idx_n))

            if prev_idx is not None and prev_idx[0]!=prev_idx_n:
                self.column_position.append((self.col_pos_id,prev_idx[0],self.state_id,x,prev_idx_n,prev_idx[1]))
                self.col_pos_id+=1        
            #self.curr_col_schema.insert(idx,(x,self.col_id))            
            self.col_names_coll.add(x)
            """
            
        self.curr_col_schema = temp_new_col
            
        pass
    
    def change_row_position(self,prev_df,now_df):
        old_row = list(self.curr_row)
        now_row = list(now_df.index)
        #new_row = set(now_row)-set(old_row)
        
        temp_cur_row_pos = []
        for x in now_row:
            #logging.debug((x))
            temp_cur_row_pos.append(self.curr_row_pos[x])            
        
        logging.debug(temp_cur_row_pos)
        pass
    
    def add_column(self,prev_df,df,state_ss=None):
        """Register the columns that exist in ``df`` but not in ``prev_df``.

        For every new column (in the order it appears in ``df``) this appends
        a ``column_position`` and a ``column`` record, inserts a schema entry
        at the column's position, creates one cell and one cell value per
        entry of ``self.curr_row``, links each new value to the latest value
        of every input column in the same row (``value_derived_from``),
        inserts a matching column into ``self.pd_index`` and records the
        column-level dependencies (``col_dependency``). Afterwards the
        pre-existing schema entries whose predecessor changed get a fresh
        ``column_position`` record.

        Input columns are obtained from ``dep_input_output_col`` using the
        operation description in ``state_ss``.

        Changes from the earlier version:
        * ``prev_idx`` now starts as ``(None, None, -1, None)`` for every new
          column. Before, inserting a column at position 0 raised
          ``UnboundLocalError`` on the first iteration and silently reused the
          previous column's predecessor on later ones.
        * ``state_ss=None`` (the declared default) no longer fails when the
          description is looked up; an empty description is used instead.

        Parameters
        ----------
        prev_df, df
            Frames before and after the operation.
        state_ss : dict, optional
            Expected to provide ``state_ss["operation"]["description"]``.
        """
        old_col = prev_df.columns
        now_col = df.columns
        new_col = set(now_col)-set(old_col)
        logging.debug(("new_col:",new_col,self.curr_col_schema))

        # sort new_col by the index
        new_col = list(filter(lambda x:x in new_col,now_col))

        description = state_ss["operation"]["description"] if state_ss is not None else ""
        in_col,out_col = dep_input_output_col(prev_df,df,description)

        old_col_schema = self.curr_col_schema.copy()

        for x in new_col:
            last_col = self.curr_col_schema[-1][1]
            new_col_val = df.loc[:,[x]]
            logging.debug(new_col_val.values.tolist())
            idx = list(now_col).index(x)

            logging.debug(idx)
            # -1 marks "no previous column", as init_df does for the first column.
            prev_idx = (None,None,-1,None)
            if idx > 0:
                prev_idx = self.curr_col_schema[idx-1]

            self.column_position.append((self.col_pos_id,self.col_id,self.state_id,x,prev_idx[2],-1))
            logging.debug(("curr_col_schema",self.curr_col_schema))
            self.curr_col_schema.insert(idx,(x,self.col_pos_id,self.col_id,self.curr_col_schema[-1][2]))
            self.curr_col_pos[self.col_id] = (self.col_pos_id,last_col)
            self.col_names_coll.add(x)

            # add values
            self.column.append((self.col_id,self.array_id))
            temp_idx = []
            for y,i in zip(new_col_val.values.tolist(),self.curr_row):
                self.cell.append((self.cell_id,self.col_id,i))
                # NOTE(review): y is a one-element list holding the cell value,
                # so y[0][0] indexes *into* the value itself - confirm that cell
                # values are sequences here (init_df stores the value directly).
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y[0][0] if y[0]!=None else y[0],-1))
                temp_idx.append([(self.cell_id,self.value_id,i,self.col_id)])

                # add linkage to the derived by cell
                # lookup value id for input col_value
                logging.debug(("in_col:",in_col,self.curr_col_schema))
                for z in in_col:
                    zzl = [s[-2] if len(s)==4 else s[-1] for s in list(filter(lambda s:(s[0]==z),self.curr_col_schema))]
                    logging.debug(("zzl:",zzl))
                    for zz in zzl:
                        op_cid = list(filter(lambda c:(c[1]==zz)&(c[2]==i),self.cell))
                        logging.debug(("op_cid:",op_cid))
                        for zzz in op_cid:
                            # Values of that cell, newest (highest value id) first.
                            op_val = list(filter(lambda v:v[1]==zzz[0],sorted(self.cell_values,key=lambda v:v[0])))[::-1]
                            logging.debug(("op_val:",op_val))
                            for v in op_val:
                                # Only the newest value is linked.
                                self.value_derived_from.append((self.value_id,self.state_id,v[0]))
                                break

                self.value_id+=1
                self.cell_id+=1

            self.pd_index.insert(loc=idx,column=self.col_id,value=pd.DataFrame(temp_idx)[0].tolist())

            for z in in_col:
                zzl = [s[-2] for s in list(filter(lambda s:(s[0]==z),self.curr_col_schema))]
                for zz in zzl:
                    self.col_dependency.append((self.state_id,self.col_id,zz))

            self.col_pos_id+=1
            self.col_id+=1

        # normalize column position
        temp_new_col = []
        for n_idx,x in enumerate(self.curr_col_schema):
            filtered_col = list(filter(lambda y: y[2] == x[2],old_col_schema))
            if len(filtered_col)>0:
                # Column existed before: see whether its predecessor changed.
                idx = [y[2] for y in old_col_schema].index(x[2])
                prev_old_schema = (None,None,None,None)
                if idx > 0:
                    prev_old_schema = old_col_schema[idx - 1]
                old_schema_idx = old_col_schema[idx]

                prev_new_schema = (None,None,None,None)
                if n_idx > 0:
                    prev_new_schema = self.curr_col_schema[n_idx - 1]

                if prev_old_schema[2] != prev_new_schema[2]:
                    self.column_position.append((self.col_pos_id,old_schema_idx[1],self.state_id,x[0],prev_new_schema[1],old_schema_idx[1]))

                    temp_new_col.append((x[0],self.col_pos_id,old_schema_idx[1],prev_new_schema[1]))
                    self.col_pos_id+=1
                else:
                    temp_new_col.append(old_col_schema[idx])
            else:
                # Newly added column: keep the entry created above.
                temp_new_col.append(self.curr_col_schema[n_idx])

        logging.debug(("add column",temp_new_col))

        self.curr_col_schema = temp_new_col
    
    def remove_column(self,prev_df,df):
        #old_col = self.curr_col
        old_col = prev_df.columns
        curr_col = self.curr_col_schema
        now_col = df.columns
        
        removed = set(old_col) - set(now_col)
        logging.debug(("remove_column: ",removed))
        for x in removed:
            idx = list(old_col).index(x)

            #logging.debug(idx)
            prev_idx = None
            next_idx = None
            if idx > 0:
                prev_idx = self.curr_col_schema[idx-1]
            if idx < len(old_col)-1:
                next_idx = self.curr_col_schema[idx+1]
            
            self.column_position.append((self.col_pos_id,self.curr_col_schema[idx][1],self.state_id,x,-2,self.curr_col_schema[idx][1]))
            self.curr_col_pos[idx] = (self.col_pos_id,-2)
            old_col = self.curr_col_schema.pop(idx)
            
            
            self.col_pos_id+=1          
            if next_idx is not None:
                self.column_position.append((self.col_pos_id,next_idx[1],self.state_id,x,prev_idx[1],next_idx[0]))                
                self.col_pos_id+=1                                        
        pass
    
    def add_row(self,df):
        """Register every row of ``df`` whose index label is not in ``self.curr_row``.

        For each new row this appends one cell and one initial cell value
        (previous-value field ``-1``) per entry of ``self.curr_col_schema``,
        appends the row to ``self.row``, and links it after the row with id
        ``self.row_id-1`` in ``self.curr_row_pos`` / ``self.row_position``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame after the operation; its values are paired positionally
            with ``self.curr_col_schema``.
        """
        old_row = list(self.curr_row)
        now_row = list(df.index)
        known_rows = set(old_row)
        # Keep df order (and drop repeated labels) so ids are assigned deterministically.
        new_row = [label for label in dict.fromkeys(now_row) if label not in known_rows]
        logging.debug((old_row,now_row,new_row))

        for x in new_row:
            new_row_val = df.loc[[x],:]
            logging.debug(("new row",new_row_val.values.tolist(),self.curr_col_schema))

            # values.tolist()[0] is the flat list of the row's cell values,
            # so each y is already the scalar to store.
            for y,i in zip(new_row_val.values.tolist()[0],self.curr_col_schema):
                self.cell.append((self.cell_id,i[1],self.row_id))
                self.cell_values.append((self.value_id,self.cell_id,self.state_id,y,-1))
                self.cell_id+=1
                self.value_id+=1

            self.row.append((self.row_id,self.array_id))

            prev_row_id = self.row_id-1
            if self.curr_row_pos[prev_row_id][1]!=-2:
                # Preceding row is still present: link directly after it.
                self.curr_row_pos[self.row_id] = (self.row_pos_id,prev_row_id)
                self.row_position.append((self.row_pos_id,self.row_id,self.state_id,prev_row_id,-1))
            else:
                # Preceding row was removed: reuse the predecessor from its
                # last position record that is not a removal marker.
                temp_prev_row = [p for p in self.row_position if p[1]==prev_row_id and p[3]!=-2]
                if len(temp_prev_row)>0:
                    self.curr_row_pos[self.row_id] = (self.row_pos_id,temp_prev_row[-1][3])
                    self.row_position.append((self.row_pos_id,self.row_id,self.state_id,temp_prev_row[-1][3],-1))
                else:
                    # NOTE(review): no row_position entry is written on this
                    # path (unchanged from the original) - confirm intended.
                    self.curr_row_pos[self.row_id] = (self.row_pos_id,prev_row_id)

            self.row_id+=1
            self.row_pos_id+=1

    def remove_row(self,df):
        """Record the removal of every row in ``self.curr_row`` that is absent from ``df``.

        For each removed row a tombstone entry (predecessor field ``-2``) is
        appended to ``self.row_position`` and stored in ``self.curr_row_pos``.
        The row that had the removed row as its predecessor, if any, is then
        re-linked to the removed row's own predecessor with a second entry.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame after the operation; only its index is read.
        """
        old_row = list(self.curr_row)
        now_row = list(df.index)
        surviving = set(now_row)
        # Keep the stored row order so position ids are assigned deterministically.
        removed_row = [label for label in dict.fromkeys(old_row) if label not in surviving]
        logging.debug((old_row,now_row,removed_row))

        for x in removed_row:
            temp_row_pos = self.curr_row_pos[x]

            self.row_position.append((self.row_pos_id,x,self.state_id,-2,temp_row_pos[0]))
            self.curr_row_pos[x] = (self.row_pos_id,-2)
            self.row_pos_id+=1

            # Successor = the row whose current predecessor is the removed row.
            # (The item variable must not be called x, or it hides the row id.)
            next_row = [item for item in self.curr_row_pos.items() if item[1][1]==x]
            if len(next_row)>0:
                next_row = next_row[0]
                self.row_position.append((self.row_pos_id,next_row[0],self.state_id,temp_row_pos[1],next_row[1][0]))
                self.curr_row_pos[next_row[0]] = (self.row_pos_id,temp_row_pos[1])

                self.row_pos_id+=1
    
    def change_values(self,df,change):
        """Record a new value for every cell marked ``True`` in ``change``.

        For each changed cell this appends a new entry to ``self.cell_values``
        that points back at the previous value id, stores the new value id in
        ``self.pd_index``, appends a ``self.value_derived_from`` link, and
        adds one self-dependency per touched column to ``self.col_dependency``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame after the operation; missing values are read as ``""``.
        change : pandas.DataFrame
            Boolean frame with the same shape as ``df`` and ``self.pd_index``.

        Notes
        -----
        ``self.pd_index`` cells are read as ``(cell_id, value_id, ...)``
        tuples. Positions taken from ``change`` are also used as labels into
        ``self.pd_index``, so it is assumed to carry default integer labels
        on both axes - TODO confirm.
        """
        mask = change.to_numpy().astype(bool)
        logging.debug(("change_values pd_index:",self.pd_index,mask))

        # np.nonzero and boolean-mask indexing both walk the array in
        # row-major order, so the three sequences below line up one-to-one.
        positions = list(zip(*np.nonzero(mask)))
        idx = self.pd_index.to_numpy()[mask].flatten()
        val = df.fillna("").to_numpy()[mask].flatten()
        logging.debug(("idx:",idx,val))

        # Drop cells without an index entry *together with* their value and
        # position, so the remaining triples stay aligned.
        changes = [
            (entry,new_val,pos)
            for entry,new_val,pos in zip(idx,val,positions)
            if isinstance(entry,(tuple,list)) or not pd.isna(entry)
        ]
        logging.debug(("change_values idxlist:",[c[0] for c in changes],[c[1] for c in changes]))

        # cell_id -> column id, built once; the first match wins.
        cell_to_col = {}
        for known_cell in self.cell:
            cell_to_col.setdefault(known_cell[0],known_cell[1])

        set_temp_col = set()

        for x in changes:
            logging.debug(("change_values idxlist:",x))
            entry,new_val,pos = x
            cell_id,prev_value_id = entry[0],entry[1]
            row_label,col_label = int(pos[0]),int(pos[1])

            self.cell_values.append((self.value_id,cell_id,self.state_id,new_val,prev_value_id))

            # .at writes the tuple into one cell without broadcasting it.
            ttx = list(self.pd_index.at[row_label,col_label])
            ttx[1] = self.value_id
            self.pd_index.at[row_label,col_label] = tuple(ttx)

            self.value_derived_from.append((self.value_id,self.state_id,prev_value_id))

            temp_col = cell_to_col[cell_id]
            if temp_col not in set_temp_col:
                self.col_dependency.append((self.state_id,temp_col,temp_col))
                set_temp_col.add(temp_col)

            self.value_id+=1
                    
    
    def change_df(self,prev_df,now_df,state_ss,user=None,prev_ss=None):
        """Record the transition from ``prev_df`` to ``now_df`` as one new state.

        The two frames are compared and the matching recorder is called:
        ``add_column`` / ``remove_column`` when the column count differs,
        ``change_column_schema`` when the labels differ at equal count,
        ``add_row`` / ``remove_row`` when the row count differs,
        ``change_row_position`` when more than one index label differs at
        equal count, and ``change_values`` when any cell value differs.
        Finally the state is appended to ``self.state_detail`` and
        ``self.state`` and the state/execution counters advance.

        Parameters
        ----------
        prev_df, now_df : pandas.DataFrame
            Snapshots before and after the operation.
        state_ss
            State summary stored as-is in ``self.state_detail``. When it is a
            mapping with ``["operation"]["description"]``, that text is also
            used to derive column dependencies.
        user : hashable, optional
            User key; registered in ``self.user`` on first use.
        prev_ss : optional
            Explicit parent state id. ``None`` means "continue from
            ``self.prev_state_id``".

        Returns
        -------
        bool
            Always ``True``.
        """
        # Best-effort: a summary without a description (e.g. a plain string)
        # simply yields no description-derived dependencies.
        try:
            in_col,out_col = dep_input_output_col(prev_df,now_df,state_ss["operation"]["description"])

            # No output column named: record each input column as depending on itself.
            if len(out_col)==0:
                logging.debug(("in_col:",in_col,self.curr_col_schema))
                for z in in_col:
                    zzl = [x[-2] if len(x)==4 else x[-1] for x in self.curr_col_schema if x[0]==z]
                    logging.debug(("zzl",zzl))
                    for zzy in zzl:
                        self.col_dependency.append((self.state_id,zzy,zzy))
        except Exception as ex:
            logging.debug(("change_df: column dependencies not derived",ex))

        prev_col = prev_df.columns
        now_col = now_df.columns

        # columns added or removed
        if len(now_col)>len(prev_col):
            logging.debug("add column")
            self.add_column(prev_df,now_df,state_ss)
        elif len(prev_col)>len(now_col):
            logging.debug("remove_column")
            self.remove_column(prev_df,now_df)

        # same number of columns but different labels/order
        if len(prev_col)==len(now_col) and np.sum(prev_col!=now_col)>0:
            logging.debug("change column schema")
            self.change_column_schema(prev_df,now_df)

        prev_row = np.array(list(prev_df.index))
        now_row = np.array(list(now_df.index))

        # rows added or removed
        if len(now_row)>len(prev_row):
            logging.debug("add row")
            self.add_row(now_df)
        elif len(prev_row)>len(now_row):
            logging.debug("remove row")
            self.remove_row(now_df)

        # same number of rows but more than one label differs
        if len(prev_row)==len(now_row) and np.sum(prev_row!=now_row)>1:
            logging.debug("change row position")
            self.change_row_position(prev_df,now_df)

        # Cell-level changes. The comparison raises when the two frames are
        # not identically labelled (e.g. after an add/remove); that case is
        # logged and skipped, as before.
        try:
            change_val = now_df.fillna("")!=prev_df.fillna("")
            if change_val.to_numpy().sum()>0:
                logging.debug("change values")
                self.change_values(now_df,change_val)
            else:
                logging.debug("nothing change")
        except Exception as ex:
            logging.debug(ex)

        # Register the user on first sight.
        if user not in self.user:
            self.user[user]=self.user_id
            self.user_id+=1

        parent_state = self.prev_state_id if prev_ss is None else prev_ss
        self.state_detail.append((self.state_id,state_ss,self.execution_id,parent_state,self.user[user]))
        self.state.append((self.state_id,parent_state))
        self.prev_state_id = self.state_id
        self.state_id+=1

        self.execution_id+=1

        return True
        
    def transform(self):
        """Replay ``self.trace`` from its first entry to its last.

        The first entry is handed to ``init_dataset``; every later entry is
        recorded with ``change_df`` against the snapshot before it. After each
        entry ``self.curr_col``, ``self.curr_row`` and ``self.curr_df`` are
        set to the snapshot just processed. The snapshot is read from
        position 5 of each trace entry.
        """
        prev_df = None
        for i,x in enumerate(self.trace):
            now_df = x[5]
            if i == 0:
                # first entry initialises the dataset
                self.init_dataset(x)
            else:
                # change_df requires a state summary argument; the call used
                # to omit it and raised TypeError on the second entry.
                # NOTE(review): the trace layout is not visible here, so no
                # summary is passed - replace None with the right trace field.
                self.change_df(prev_df,now_df,None)

            self.curr_col = now_df.columns
            self.curr_row = list(now_df.index)
            self.curr_df = now_df
            prev_df = now_df
        
        

# Load Data

In [4]:
# Stack the case_1 and case_2 datasets (all columns read as strings).
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x.
original_pd = pd.concat(
    [
        pd.read_csv("../case_1/chicago_vert_dataset.csv",dtype=str),
        pd.read_csv("../case_2/chicago_vert_dataset.csv",dtype=str),
    ]
)
original_pd = original_pd.fillna("").reset_index(drop=True)
# Flag columns are integers; assigning the whole column (rather than
# .loc[:, col] = ...) replaces it so the integer dtype is kept.
for flag_col in [col for col in original_pd.columns if col.endswith("flag")]:
    original_pd[flag_col] = original_pd[flag_col].astype(int)
# Untouched copy of the combined dataset.
original_pd_before = original_pd.copy()

  original_pd = pd.read_csv("../case_1/chicago_vert_dataset.csv",dtype=str).append(pd.read_csv("../case_2/chicago_vert_dataset.csv",dtype=str))
  original_pd.loc[:,x] = original_pd.loc[:,x].astype(int)


In [5]:
# Row labels of the first dataset (case_1): re-read only to obtain its index.
group1_index = pd.read_csv("../case_1/chicago_vert_dataset.csv",dtype=str).index
# Bare expression in the middle of the cell: evaluated, but only a cell's
# last expression is displayed.
group1_index
# Row labels of the second dataset: everything after group1 in original_pd.
group2_index = pd.Index(range(group1_index.stop,original_pd.shape[0]))

In [6]:
group1_index,group2_index

(RangeIndex(start=0, stop=1486, step=1),
 RangeIndex(start=1486, stop=3201, step=1))

In [7]:
original_pd.tail(50)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
3151,764622652958240880,Group book 3 Penthouse by Cloud9,248760412,Cloud9,,Near West Side,4188764.0,-87.64578,Entire home/apt,1488.0,...,R22000092079,0,0,0,0,0,0,0,0,0
3152,764669447707306735,South Loop Chicago-5 min to DT,448589644,Andrea,,Near South Side,41.86006,-87.62485,Entire home/apt,120.0,...,,0,0,0,0,0,0,0,0,0
3153,765003851418902674,"Bucktown 1br w/ roof & bbq, nr blue line",107434423,Blueground,,WeDstYC Town,41.91152789999999,-87.67875529999998,Entire home/apt,145.0,...,,0,0,0,0,0,0,0,0,0
3154,765213311537655730,Classic Hp 1 Br with Fast Transit to UChicago Dt,47172572,Zencity,,KgenRwoood,41.80302,-87.59742,Apartment,70.0,...,R22000092864,0,0,0,0,0,0,0,0,0
3155,765213311537655730,Classic Hp 1 Br with Fast Transit to UChicago Dt,47172572,Zencity,,KgenRwoood,41.80302,-87.59742,Apartment,70.0,...,R22000092864,0,0,0,0,0,0,0,0,0
3156,766069261296774780,Old-World Unit 3 Downtown!,170785489,Dmd,,Near North Side,41.90117,-87.6257,Entire home/apt,137.0,...,2210122,0,0,0,0,0,0,0,0,0
3157,766499608228095208,"Clean modern apt w/ parking, laundry",36989519,Mushky,,West Ridge,41.99345,-87.70609,Entire home/apt,91.0,...,R22000091827,0,0,0,0,0,0,0,0,0
3158,766657991814572268,Smoke n Stay - 2BR 10min from DT,468900645,Dana,,SoutHhtk Chicago,41.75017,-87.55126,Entire home/apt,99.0,...,R22000088337,0,0,0,0,0,0,0,0,0
3159,767226985954458283,"W. Loop 2br w/ gym & coworking, nr UIC",107434423,Blueground,,Near West Side,41.8763,-87.65371999999999,Entire home/apt,118.0,...,,0,0,0,0,0,0,0,0,0
3160,767434780399355268,100 home away from home,70728663,Shelby,,Bridgeport,41.82881576674946,-87.64687367897614,Private room,68.0,...,,0,0,0,0,0,0,0,0,0


In [8]:
# Load the listings file, all columns as strings, with missing values as "".
super_ori_pd = pd.read_csv("../../../collaboration_simulation/airbnb_test_case/chicago_listings.csv",dtype=str)
super_ori_pd = super_ori_pd.fillna("")
# Give it the same *_flag columns as original_pd, all set to 0.
for x in list(filter(lambda x:x.endswith("flag"),original_pd.columns)):
    super_ori_pd.loc[:,x] = 0
#super_ori_pd = super_ori_pd.loc[super_ori_pd.id.isin(original_pd.id),original_pd.columns]
#super_ori_pd
# Join on id. Overlapping columns from original_pd get the "_x" suffix while
# the listings columns keep their plain names, so the selection below takes
# the listings values laid out with original_pd's columns.
super_ori_pd = original_pd.merge(super_ori_pd,left_on="id",right_on="id",suffixes=("_x",""))
super_ori_pd = super_ori_pd.loc[:,original_pd.columns].reset_index(drop=True)

In [9]:
original_pd.describe()

Unnamed: 0,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
count,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
super_ori_pd.describe()

Unnamed: 0,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
count,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0,3201.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
original_pd

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,782628636832878491,"Steps to Shop, Eat, Train | Easy Access | Zencity",47172572,Zencity,,West Town,41.8955,-87.66124,Entire home/apt,50,...,R19000043484,0,0,0,0,0,0,0,0,0
3197,782643895516370805,Old Town Oasis,169297663,William,,Near North Side,41.90105,-87.63716,Apartment,120,...,R22000093645,0,0,0,0,0,0,0,0,0
3198,784994899201350568,Lovely 1 bed Apt in River North,52827024,Yakir,,Near West Side,41.88822006416301,-87.64145321718578,Entire home/apt,84,...,,0,0,0,0,0,0,0,0,0
3199,785423932330914663,"River North 1br w/ gym, pool & roof, nr Riverwalk",107434423,Blueground,,Near North Side,41.890516,-87.635955,Apartment,169,...,,0,0,0,0,0,0,0,0,0


In [12]:
original_pd.shape,super_ori_pd.shape

((3201, 27), (3201, 27))

In [13]:
import numpy as np
import json
import pandas as pd
import pickle

#dcm1 = TransformDCM(None)
# Restore a previously saved tracker object from disk.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open("dcm_collab_c1_c2.pickle","rb") as file:
    dcm1 = pickle.load(file)

In [14]:
#dcm1.init_dataset_df(super_ori_pd)

In [15]:
#dcm1.curr_col = super_ori_pd.columns
#dcm1.curr_row = list(super_ori_pd.index) 
#dcm1.curr_df = super_ori_pd

In [16]:
#dcm1.change_df(super_ori_pd,original_pd,"perturbed_dataset","s1")

In [17]:
# cell_values tuples are (value_id, cell_id, state_id, value, previous_value_id),
# so the frame's columns are 0..4 in that order.
xx = pd.DataFrame(dcm1.cell_values)
# Values written in state 0, joined to the value each one replaced
# (column 4 = previous value id, matched against column 0 = value id).
xx[xx[2]==0].merge(xx,left_on=4,right_on=0)

Unnamed: 0,4,0_x,1_x,2_x,3_x,4_x,0_y,1_y,2_y,3_y,4_y
0,66,86427,66,0,"November 28, 2022",66,66,66,-1,2022-11-28,-1
1,89,86428,89,0,Entire home,89,89,89,-1,Entire home/apt,-1
2,167,86429,167,0,Lhke eieu,167,167,167,-1,Lake View,-1
3,170,86430,170,0,Entire home,170,170,170,-1,Entire home/apt,-1
4,224,86431,224,0,Entire home,224,224,224,-1,Entire home/apt,-1
...,...,...,...,...,...,...,...,...,...,...,...
2286,86246,88713,86246,0,Apartment,86246,86246,86246,-1,Entire home/apt,-1
2287,86273,88714,86273,0,Apartment,86273,86273,86273,-1,Entire home/apt,-1
2288,86327,88715,86327,0,Apartment,86327,86327,86327,-1,Entire home/apt,-1
2289,86381,88716,86381,0,Apartment,86381,86381,86381,-1,Entire home/apt,-1


In [18]:
dcm1.state_detail

[(0, {'op': 'initial'}, -1, -1, None),
 (0, 'perturbed_dataset', 0, -1, 0),
 (1, 'clean_duplicate_id_c1', 1, 0, 1),
 (2, 'clean_inconsistent_host_id_c1', 2, 1, 1),
 (3, 'clean_neighbourhood_c1', 3, 2, 1),
 (4, 'clean_latitude_c1', 4, 3, 1),
 (5, 'clean_longitude_c1', 5, 4, 1),
 (6, 'clean_room_type_c1', 6, 5, 1),
 (7, 'clean_minimum_nights_c1', 7, 6, 1),
 (8, 'clean_number_of_reviews_c1', 8, 7, 1),
 (9, 'clean_last_review_c1', 9, 8, 1),
 (10, 'clean_duplicate_id_c2', 10, 0, 2),
 (11, 'clean_inconsistent_host_id_c2', 11, 10, 2),
 (12, 'clean_neighbourhood_c2', 12, 11, 2),
 (13, 'clean_latitude_c2', 13, 12, 2),
 (14, 'clean_longitude_c2', 14, 13, 2),
 (15, 'clean_room_type_c2', 15, 14, 2),
 (16, 'clean_minimum_nights_c2', 16, 15, 2),
 (17, 'clean_number_of_reviews_c2', 17, 16, 2),
 (18, 'clean_last_review_c2', 18, 17, 2)]

In [19]:
#dcm1.curr_col = original_pd.columns
#dcm1.curr_row = list(original_pd.index) 
#dcm1.curr_df = original_pd

In [20]:
#for i,c in enumerate(change_index.columns):
#    dcm1.pd_index.loc[change_index.loc[:,c].values,i]

In [21]:
def assign_to_original(original_pd,workset_pd):
    """Copy the rows of ``workset_pd`` back into ``original_pd`` (in place).

    A row of ``original_pd`` is overwritten only when its ``id`` occurs in
    ``workset_pd`` *and* its index label exists in ``workset_pd``. Rows that
    merely share an id with a workset row (duplicate ids elsewhere in the
    dataset) are left untouched instead of being blanked to NaN by the
    label-aligned assignment.

    Parameters
    ----------
    original_pd : pandas.DataFrame
        Full dataset; modified in place.
    workset_pd : pandas.DataFrame
        Cleaned subset, using the same index labels as ``original_pd``.

    Returns
    -------
    pandas.DataFrame
        ``original_pd`` itself.
    """
    mask = (
        original_pd["id"].isin(workset_pd["id"]).to_numpy()
        & original_pd.index.isin(workset_pd.index)
    )
    # Only columns present on both sides can be copied without introducing NaN.
    shared = [col for col in original_pd.columns if col in workset_pd.columns]
    if mask.any() and shared:
        rows = original_pd.index[mask]
        original_pd.loc[mask,shared] = workset_pd.loc[rows,shared]
    return original_pd

In [22]:
# Working subset for this case, all columns read as strings.
airbnb_pd = pd.read_csv("../../../collaboration_simulation/airbnb_test_case/chicago_vert_2_a.csv",dtype=str)
# Flag columns are integers; assigning the whole column (rather than
# .loc[:, col] = ...) replaces it so the integer dtype is kept.
for flag_col in [col for col in airbnb_pd.columns if col.endswith("flag")]:
    airbnb_pd[flag_col] = airbnb_pd[flag_col].astype(int)
# Use the same row labels as the second group inside original_pd.
airbnb_pd.index = group2_index

  airbnb_pd.loc[:,x] = airbnb_pd.loc[:,x].astype(int)


In [23]:
airbnb_pd.latitude.sort_values(ascending=False)

3171    42.01833
1767    42.01653
1759    42.01653
1502    42.01653
1766    42.01653
          ...   
2543         NaN
2557         NaN
2657         NaN
3014         NaN
3136         NaN
Name: latitude, Length: 1715, dtype: object

In [24]:
airbnb_pd.shape,original_pd.shape

((1715, 27), (3201, 27))

In [25]:
# take part of airbnb_pd from original_pd
# using data frame mutability, any changes will be 
# directed toward that subset of original_pd
#airbnb_pd = original_pd[original_pd.id.isin(airbnb_pd.id)]

# cleanup latitude and longitude
The latitude and longitude values in the dataset must fall within the range of -90 to +90 for latitude and -180 to +180 for longitude to ensure that they meet the criteria for analysis. We have provided a check number function to validate the latitude and longitude columns. Any values outside of these ranges should be cleaned to meet the criteria.

If you are unsure what to do with a value or if it is a null value, you can flag the row for deletion by setting latitude_flag or longitude_flag to 1 or 2, respectively.

In [26]:
def check_number(x,start=-90,end=90):
    """Return True when ``x`` converts to a float inside ``[start, end]``.

    Parameters
    ----------
    x : object
        Value to test, e.g. a numeric string, a number, or None.
    start, end : number
        Inclusive bounds; the defaults are the latitude range.

    Returns
    -------
    bool
        False for values that cannot be converted or compared, for NaN, and
        for anything outside the range.
    """
    try:
        temp_x = float(x)
        return (start <= temp_x <= end)
    except (TypeError, ValueError, OverflowError):
        # Only conversion/comparison failures mean "not a valid number";
        # interrupts and other unrelated errors are no longer swallowed.
        return False

In [27]:
def adjust_decimal(x,dec_pos=2):
    """Rescale ``x`` by a power of ten so ``dec_pos`` digits precede the decimal point.

    Example: ``adjust_decimal("4188764.0")`` gives ``41.88764``. A value
    below 1 counts as having one integer digit (the leading ``0``).

    Parameters
    ----------
    x : object
        Anything ``float()`` accepts.
    dec_pos : int
        Number of integer digits wanted in the result.

    Returns
    -------
    float
        The rescaled value; NaN and infinities are returned unchanged.

    Raises
    ------
    TypeError, ValueError
        When ``x`` cannot be converted to a float.
    """
    import math  # local import keeps this cell self-contained

    x = float(x)
    if not math.isfinite(x):
        return x

    # Count the integer digits from the truncated integer rather than from
    # str(x): str() switches to scientific notation for very large or very
    # small floats, where the position of "." no longer equals the digit count.
    digits_before_decimal = len(str(int(abs(x))))
    divisor = 10**(digits_before_decimal - dec_pos)

    return x / divisor

In [28]:
def clean_latitude(df):
    """Repair out-of-range latitudes in ``df`` and set ``latitude_flag``.

    Values already accepted by ``check_number`` (range -90..90) are kept
    untouched. Every other value is rescaled with ``adjust_decimal``; values
    that cannot be parsed at all are left as they are. ``latitude_flag`` is
    then 0 for rows whose latitude is valid and 1 for rows that are still
    invalid (including missing values).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``latitude`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df``.
    """
    def _repair(value):
        if check_number(value,-90,90):
            return value
        try:
            return adjust_decimal(value)
        except (TypeError, ValueError, OverflowError):
            # Not numeric at all: keep the value, the flag below marks the row.
            return value

    df["latitude"] = df["latitude"].apply(_repair)
    # One flag assignment only (an earlier one was overwritten immediately).
    df["latitude_flag"] = df["latitude"].apply(lambda value: 0 if check_number(value,-90,90) else 1)
    return df

In [29]:
airbnb_pd.latitude.sort_values(ascending=False)

3171    42.01833
1767    42.01653
1759    42.01653
1502    42.01653
1766    42.01653
          ...   
2543         NaN
2557         NaN
2657         NaN
3014         NaN
3136         NaN
Name: latitude, Length: 1715, dtype: object

In [30]:
# apply the clean_latitude function to the 'latitude' column
airbnb_pd = clean_latitude(airbnb_pd)
# airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude']
#airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude'] = airbnb_pd.loc[airbnb_pd['latitude_flag']==1, 'latitude'].apply(lambda x: clean_latitude_pls(x))

# Latitude checking

This query should return zero rows once you implement the cleaning process

In [31]:
# Rows that are marked as clean (flag 0) ...
lat_check_pd = airbnb_pd[airbnb_pd.latitude_flag==0]
# ... but whose latitude still fails the -90..90 range check.
lat_check_pd = lat_check_pd[lat_check_pd.latitude.apply(lambda x:check_number(x,-90,90))==False]
# An empty result means no row is wrongly marked as clean.
lat_check_pd[["id","latitude"]]

Unnamed: 0,id,latitude


In [32]:
workset_before = original_pd.copy()
workset_after = original_pd.copy()

In [33]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [34]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                          98
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      8
longitude_flag                     0
minimum_nights_flag                0
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [35]:
dcm1.change_df(workset_before,workset_after,"clean_latitude_c3","c3",0)

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(40155, 87487, 1487, 6), (40182, 87488, 1488, 6), (40425, 87492, 1497, 6), (40830, 87501, 1512, 6), (41640, 87521, 1542, 6), (41856, 87531, 1550, 6), (41964, 87534, 1554, 6), (41991, 87536, 1555, 6), (42558, 87549, 1576, 6), (43287, 87564, 1603, 6), (43854, 87579, 1624, 6), (43881, 87580, 1625, 6), (43962, 87583, 1628, 6), (44556, 87594, 1650, 6), (44583, 87595, 1651, 6), (44799, 87598, 1659, 6), (44826, 87601, 1660, 6), (44907, 87603, 1663, 6), (45231, 87606, 1675, 6), (45258, 87609, 1676, 6), (45393, 87616, 1681, 6), (45420, 87621, 1682, 6), (46851, 87665, 1735, 6), (47121, 87676, 1745, 6), (47931, 87692, 1775, 6), (48660, 87700, 1802, 6), (49119, 87708, 1819, 6), (50388, 87744, 1866, 6), (50496, 87745, 1870, 6), (50523, 87746, 1871, 6), (50604, 87751, 1874, 6), (50712, 87752, 1878, 6), (51306, 87764, 1900, 6), (51360, 87766, 1902, 6), (51402, 91015, 1903, 21), (54357, 87839, 2013, 6), (54573, 87844, 2021, 6), (55005, 87854, 2037, 6), (55950, 87

DEBUG:root:('change_values idxlist:', ((61728, 88039, 2286, 6), 41.95546, (2286, 6)))
DEBUG:root:('change_values idxlist:', ((62619, 88064, 2319, 6), 41.89678629999999, (2319, 6)))
DEBUG:root:('change_values idxlist:', ((62700, 88068, 2322, 6), 41.98756, (2322, 6)))
DEBUG:root:('change_values idxlist:', ((62727, 88069, 2323, 6), 41.98756, (2323, 6)))
DEBUG:root:('change_values idxlist:', ((64185, 88103, 2377, 6), 41.88544, (2377, 6)))
DEBUG:root:('change_values idxlist:', ((64995, 88130, 2407, 6), 41.90732, (2407, 6)))
DEBUG:root:('change_values idxlist:', ((65589, 88150, 2429, 6), 41.88082, (2429, 6)))
DEBUG:root:('change_values idxlist:', ((66075, 88162, 2447, 6), 41.90192, (2447, 6)))
DEBUG:root:('change_values idxlist:', ((66210, 88166, 2452, 6), 41.7952995300293, (2452, 6)))
DEBUG:root:('change_values idxlist:', ((66993, 88180, 2481, 6), 41.8723, (2481, 6)))
DEBUG:root:('change_values idxlist:', ((67641, 88203, 2505, 6), 41.8634989, (2505, 6)))
DEBUG:root:('change_values idxlist:'

True

In [36]:
dcm1.state_detail

[(0, {'op': 'initial'}, -1, -1, None),
 (0, 'perturbed_dataset', 0, -1, 0),
 (1, 'clean_duplicate_id_c1', 1, 0, 1),
 (2, 'clean_inconsistent_host_id_c1', 2, 1, 1),
 (3, 'clean_neighbourhood_c1', 3, 2, 1),
 (4, 'clean_latitude_c1', 4, 3, 1),
 (5, 'clean_longitude_c1', 5, 4, 1),
 (6, 'clean_room_type_c1', 6, 5, 1),
 (7, 'clean_minimum_nights_c1', 7, 6, 1),
 (8, 'clean_number_of_reviews_c1', 8, 7, 1),
 (9, 'clean_last_review_c1', 9, 8, 1),
 (10, 'clean_duplicate_id_c2', 10, 0, 2),
 (11, 'clean_inconsistent_host_id_c2', 11, 10, 2),
 (12, 'clean_neighbourhood_c2', 12, 11, 2),
 (13, 'clean_latitude_c2', 13, 12, 2),
 (14, 'clean_longitude_c2', 14, 13, 2),
 (15, 'clean_room_type_c2', 15, 14, 2),
 (16, 'clean_minimum_nights_c2', 16, 15, 2),
 (17, 'clean_number_of_reviews_c2', 17, 16, 2),
 (18, 'clean_last_review_c2', 18, 17, 2),
 (19, 'clean_latitude_c3', 19, 0, 3)]

In [37]:
workset_before = workset_after.copy()

In [39]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd[check_pd[2]==19]

Unnamed: 0,0,1,2,3,4
91922,91922,40155,19,41.89615,87487
91923,91923,40182,19,41.89615,87488
91924,91924,40425,19,41.88999,87492
91925,91925,40830,19,41.68582,87501
91926,91926,41640,19,41.89592,87521
...,...,...,...,...,...
92023,92023,85785,19,41.8167,88695
92024,92024,85893,19,41.894529,88699
92025,92025,85974,19,41.90362,88708
92026,92026,86190,19,41.90271,88711


In [40]:
def clean_longitude(df):
    """Repair the ``longitude`` column and recompute ``longitude_flag``.

    Each value that fails ``check_number(value)`` is replaced by
    ``adjust_decimal(value)``; values that pass are kept as they are.
    ``longitude_flag`` is then set to 0 where the (possibly repaired) value
    passes ``check_number(value, -180, 180)`` and to 2 otherwise.

    The frame is modified in place and the same object is returned.
    """
    def _repair(value):
        # Only touch values the generic number check rejects.
        if check_number(value):
            return value
        return adjust_decimal(value)

    def _flag(value):
        # 0 when the range check passes, 2 otherwise.
        return 0 if check_number(value, -180, 180) else 2

    df["longitude"] = df["longitude"].apply(_repair)
    df["longitude_flag"] = df["longitude"].apply(_flag)
    return df

In [41]:
# Run the longitude cleaner, then list the longitude of every row flagged 1.
# NOTE(review): clean_longitude as written only assigns flags 0 and 2, so this
# selection is expected to be empty — confirm whether flag 2 was intended here.
airbnb_pd = clean_longitude(airbnb_pd)
flagged_one = airbnb_pd['longitude_flag'].eq(1)
airbnb_pd.loc[flagged_one, 'longitude']

Series([], Name: longitude, dtype: object)

# Longitude checking

This query should return zero rows once you implement the cleaning process

In [42]:
# Sanity check: rows flagged 0 whose longitude still fails the [-180, 180] check.
flag_zero_rows = airbnb_pd[airbnb_pd["longitude_flag"].eq(0)]
passes_range = flag_zero_rows["longitude"].apply(lambda value: check_number(value, -180, 180))
lon_check_pd = flag_zero_rows[passes_range.eq(False)]
lon_check_pd[["id", "longitude"]]

Unnamed: 0,id,longitude


In [43]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [44]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                 0
name                               0
host_id                            0
host_name                          0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                         93
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
number_of_reviews_ltm              0
license                            0
id_flag                            0
host_id_flag                       0
neighbourhood_flag                 0
latitude_flag                      0
longitude_flag                     2
minimum_nights_flag                0
number_of_reviews_flag             0
last_review_flag                   0
room_type_flag                     0
d

In [45]:
dcm1.change_df(workset_before,workset_after,"clean_longitude_c3","c3")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(41587, 87518, 1540, 7), (42208, 87540, 1563, 7), (42667, 87551, 1580, 7), (43396, 87568, 1607, 7), (43828, 87578, 1623, 7), (45637, 87629, 1690, 7), (45664, 87630, 1691, 7), (48013, 87693, 1778, 7), (49093, 87706, 1818, 7), (49174, 87711, 1821, 7), (50281, 87743, 1862, 7), (50578, 87749, 1873, 7), (51172, 87759, 1895, 7), (51307, 87765, 1900, 7), (51631, 87774, 1912, 7), (51658, 87779, 1913, 7), (52306, 87791, 1937, 7), (52333, 87792, 1938, 7), (53359, 87816, 1976, 7), (53737, 87824, 1990, 7), (53818, 87829, 1993, 7), (53845, 87830, 1994, 7), (54655, 87845, 2024, 7), (54682, 87846, 2025, 7), (54763, 87847, 2028, 7), (54790, 87849, 2029, 7), (55762, 87873, 2065, 7), (55924, 87877, 2071, 7), (56113, 87887, 2078, 7), (56383, 87905, 2088, 7), (57436, 87932, 2127, 7), (57490, 87936, 2129, 7), (57841, 87943, 2142, 7), (58666, 91120, 2172, 22), (58678, 87962, 2173, 7), (59542, 87982, 2205, 7), (59569, 87984, 2206, 7), (59785, 87989, 2214, 7), (60055, 87

DEBUG:root:('change_values idxlist:', ((67993, 88212, 2518, 7), -87.64142, (2518, 7)))
DEBUG:root:('change_values idxlist:', ((68020, 88214, 2519, 7), -87.64142, (2519, 7)))
DEBUG:root:('change_values idxlist:', ((68128, 88217, 2523, 7), -87.67293, (2523, 7)))
DEBUG:root:('change_values idxlist:', ((68533, 88225, 2538, 7), -87.63264, (2538, 7)))
DEBUG:root:('change_values idxlist:', ((68830, 88234, 2549, 7), -87.64641, (2549, 7)))
DEBUG:root:('change_values idxlist:', ((69061, 91146, 2557, 22), 2, (2557, 22)))
DEBUG:root:('change_values idxlist:', ((69262, 88242, 2565, 7), -87.68813, (2565, 7)))
DEBUG:root:('change_values idxlist:', ((69289, 88243, 2566, 7), -87.67422, (2566, 7)))
DEBUG:root:('change_values idxlist:', ((69640, 88253, 2579, 7), -87.62704269999999, (2579, 7)))
DEBUG:root:('change_values idxlist:', ((70315, 88267, 2604, 7), -87.7169, (2604, 7)))
DEBUG:root:('change_values idxlist:', ((71557, 88300, 2650, 7), -87.64739, (2650, 7)))
DEBUG:root:('change_values idxlist:', ((7

True

In [46]:
dcm1.state_detail

[(0, {'op': 'initial'}, -1, -1, None),
 (0, 'perturbed_dataset', 0, -1, 0),
 (1, 'clean_duplicate_id_c1', 1, 0, 1),
 (2, 'clean_inconsistent_host_id_c1', 2, 1, 1),
 (3, 'clean_neighbourhood_c1', 3, 2, 1),
 (4, 'clean_latitude_c1', 4, 3, 1),
 (5, 'clean_longitude_c1', 5, 4, 1),
 (6, 'clean_room_type_c1', 6, 5, 1),
 (7, 'clean_minimum_nights_c1', 7, 6, 1),
 (8, 'clean_number_of_reviews_c1', 8, 7, 1),
 (9, 'clean_last_review_c1', 9, 8, 1),
 (10, 'clean_duplicate_id_c2', 10, 0, 2),
 (11, 'clean_inconsistent_host_id_c2', 11, 10, 2),
 (12, 'clean_neighbourhood_c2', 12, 11, 2),
 (13, 'clean_latitude_c2', 13, 12, 2),
 (14, 'clean_longitude_c2', 14, 13, 2),
 (15, 'clean_room_type_c2', 15, 14, 2),
 (16, 'clean_minimum_nights_c2', 16, 15, 2),
 (17, 'clean_number_of_reviews_c2', 17, 16, 2),
 (18, 'clean_last_review_c2', 18, 17, 2),
 (19, 'clean_latitude_c3', 19, 0, 3),
 (20, 'clean_longitude_c3', 20, 19, 3)]

In [47]:
workset_before = workset_after.copy()

In [58]:
# Rebuild the cell-value table and show only the rows whose column 2 equals 20.
check_pd = pd.DataFrame(dcm1.cell_values)
state_20_mask = check_pd[2].eq(20)
check_pd.loc[state_20_mask]

Unnamed: 0,0,1,2,3,4
92028,92028,41587,20,-87.72881,87518
92029,92029,42208,20,-87.63383,87540
92030,92030,42667,20,-87.68732,87551
92031,92031,43396,20,-87.66189,87568
92032,92032,43828,20,-87.69208,87578
...,...,...,...,...,...
92118,92118,83248,20,-87.652692,88617
92119,92119,83653,20,-87.66632,88627
92120,92120,84058,20,-87.62559,88639
92121,92121,84436,20,-87.72144,88651


# cleanup room type
The "room_type" column in the dataset should contain one of the values defined in the list of allowed_room_type provided by the authority: ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room']. Any value outside of this list needs to be adjusted to one of the allowed values.

If you are unsure about how to adjust the value or cannot find a suitable value, you can flag the row for deletion by setting the value of room_type_flag to 1. If the "room_type" column has a null value and you cannot decide on an appropriate value, you can set the value of room_type_flag to 2.

In [49]:
allowed_room_type = ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room']

In [50]:
def clean_room_type(df):
    """Normalise ``room_type`` and set ``room_type_flag`` on ``df``.

    Steps, in order:
      1. Replace the value ``'Apartment'`` with ``'Entire home/apt'``.
      2. Set ``room_type_flag`` to 1 for every row whose ``room_type`` is not
         in the module-level ``allowed_room_type`` list.
      3. Set ``room_type_flag`` to 2 for every row whose ``room_type`` is null.
         This runs last, so null rows end up with 2 rather than 1.

    The frame is modified in place and the same object is returned.
    """
    # Room type cleaning based on the curator 1 algorithm (added by nikolaus).
    # Fix: operate on the frame that was passed in. The previous version read
    # and wrote the notebook-global ``airbnb_pd`` here, so calling the function
    # with any other frame silently skipped the replacement on that frame.
    df['room_type'] = df['room_type'].replace({'Apartment': 'Entire home/apt'})
    df.loc[~df['room_type'].isin(allowed_room_type), 'room_type_flag'] = 1
    df.loc[df['room_type'].isna(), 'room_type_flag'] = 2
    return df

In [51]:
airbnb_pd = clean_room_type(airbnb_pd)

# room_type checking

This query should return zero rows once you implement the cleaning process

In [52]:
# Sanity check: rows still flagged 0 whose room_type is outside allowed_room_type.
flag_zero_rows = airbnb_pd[airbnb_pd["room_type_flag"] == 0]
outside_allowed = flag_zero_rows["room_type"].apply(lambda value: value not in allowed_room_type)
room_type_pd = flag_zero_rows[outside_allowed]
room_type_pd[["id", "room_type"]]

Unnamed: 0,id,room_type


In [53]:
workset_after = assign_to_original(workset_after,airbnb_pd)

In [54]:
(workset_after.fillna("")!=workset_before.fillna("")).sum()

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group                 0
neighbourhood                       0
latitude                            0
longitude                           0
room_type                         213
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                         0
reviews_per_month                   0
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                             0
id_flag                             0
host_id_flag                        0
neighbourhood_flag                  0
latitude_flag                       0
longitude_flag                      0
minimum_nights_flag                 0
number_of_reviews_flag              0
last_review_flag                    0
room_type_fl

In [55]:
dcm1.change_df(workset_before,workset_after,"clean_room_type_c3","c3")

DEBUG:root:change values
DEBUG:root:('change_values pd_index:',                            0                        1   \
0                (0, 0, 0, 0)             (1, 1, 0, 1)   
1              (27, 27, 1, 0)           (28, 28, 1, 1)   
2              (54, 54, 2, 0)           (55, 55, 2, 1)   
3              (81, 81, 3, 0)           (82, 82, 3, 1)   
4            (108, 108, 4, 0)         (109, 109, 4, 1)   
...                       ...                      ...   
3196  (86292, 86292, 3196, 0)  (86293, 86293, 3196, 1)   
3197  (86319, 86319, 3197, 0)  (86320, 86320, 3197, 1)   
3198  (86346, 86346, 3198, 0)  (86347, 86347, 3198, 1)   
3199  (86373, 86373, 3199, 0)  (86374, 86374, 3199, 1)   
3200  (86400, 86400, 3200, 0)  (86401, 86401, 3200, 1)   

                           2                        3   \
0                (2, 2, 0, 2)             (3, 3, 0, 3)   
1              (29, 29, 1, 2)           (30, 30, 1, 3)   
2              (56, 56, 2, 2)           (57, 57, 2, 3)   
3      

DEBUG:root:('change_values idxlist:', [(40751, 87498, 1509, 8), (40859, 87502, 1513, 8), (41075, 87505, 1521, 8), (41210, 87511, 1526, 8), (41615, 87520, 1541, 8), (41642, 87522, 1542, 8), (41804, 87528, 1548, 8), (42047, 87538, 1557, 8), (42965, 87555, 1591, 8), (43289, 87565, 1603, 8), (43694, 87573, 1618, 8), (43802, 87577, 1622, 8), (44018, 87584, 1630, 8), (44045, 87585, 1631, 8), (44504, 87593, 1648, 8), (44801, 87599, 1659, 8), (44828, 87602, 1660, 8), (45413, 91199, 1681, 26), (45503, 87624, 1685, 8), (46016, 87638, 1704, 8), (46124, 87641, 1708, 8), (46151, 87642, 1709, 8), (46178, 87644, 1710, 8), (46205, 87646, 1711, 8), (46232, 87649, 1712, 8), (46664, 87662, 1728, 8), (46691, 87663, 1729, 8), (46799, 87664, 1733, 8), (46988, 87672, 1740, 8), (47015, 87674, 1741, 8), (47528, 87684, 1760, 8), (47798, 87689, 1770, 8), (48716, 87701, 1804, 8), (49310, 87715, 1826, 8), (49391, 87719, 1829, 8), (49553, 87724, 1835, 8), (50066, 87737, 1854, 8), (51632, 87775, 1912, 8), (51659, 87

DEBUG:root:('change_values idxlist:', ((40751, 87498, 1509, 8), 'Entire home/apt', (1509, 8)))
DEBUG:root:('change_values idxlist:', ((40859, 87502, 1513, 8), 'Entire home/apt', (1513, 8)))
DEBUG:root:('change_values idxlist:', ((41075, 87505, 1521, 8), 'Entire home/apt', (1521, 8)))
DEBUG:root:('change_values idxlist:', ((41210, 87511, 1526, 8), 'Entire home/apt', (1526, 8)))
DEBUG:root:('change_values idxlist:', ((41615, 87520, 1541, 8), 'Entire home/apt', (1541, 8)))
DEBUG:root:('change_values idxlist:', ((41642, 87522, 1542, 8), 'Entire home/apt', (1542, 8)))
DEBUG:root:('change_values idxlist:', ((41804, 87528, 1548, 8), 'Entire home/apt', (1548, 8)))
DEBUG:root:('change_values idxlist:', ((42047, 87538, 1557, 8), 'Entire home/apt', (1557, 8)))
DEBUG:root:('change_values idxlist:', ((42965, 87555, 1591, 8), 'Entire home/apt', (1591, 8)))
DEBUG:root:('change_values idxlist:', ((43289, 87565, 1603, 8), 'Entire home/apt', (1603, 8)))
DEBUG:root:('change_values idxlist:', ((43694, 875

DEBUG:root:('change_values idxlist:', ((60677, 88008, 2247, 8), 'Entire home/apt', (2247, 8)))
DEBUG:root:('change_values idxlist:', ((60785, 88010, 2251, 8), 'Entire home/apt', (2251, 8)))
DEBUG:root:('change_values idxlist:', ((60866, 88012, 2254, 8), 'Entire home/apt', (2254, 8)))
DEBUG:root:('change_values idxlist:', ((60893, 88014, 2255, 8), 'Entire home/apt', (2255, 8)))
DEBUG:root:('change_values idxlist:', ((61271, 88020, 2269, 8), 'Entire home/apt', (2269, 8)))
DEBUG:root:('change_values idxlist:', ((61595, 88033, 2281, 8), 'Entire home/apt', (2281, 8)))
DEBUG:root:('change_values idxlist:', ((61838, 88044, 2290, 8), 'Entire home/apt', (2290, 8)))
DEBUG:root:('change_values idxlist:', ((61919, 88046, 2293, 8), 'Entire home/apt', (2293, 8)))
DEBUG:root:('change_values idxlist:', ((62108, 88050, 2300, 8), 'Entire home/apt', (2300, 8)))
DEBUG:root:('change_values idxlist:', ((62135, 88051, 2301, 8), 'Entire home/apt', (2301, 8)))
DEBUG:root:('change_values idxlist:', ((62162, 880

DEBUG:root:('change_values idxlist:', ((78848, 88499, 2920, 8), 'Entire home/apt', (2920, 8)))
DEBUG:root:('change_values idxlist:', ((79145, 88509, 2931, 8), 'Entire home/apt', (2931, 8)))
DEBUG:root:('change_values idxlist:', ((79388, 88514, 2940, 8), 'Entire home/apt', (2940, 8)))
DEBUG:root:('change_values idxlist:', ((79469, 88515, 2943, 8), 'Entire home/apt', (2943, 8)))
DEBUG:root:('change_values idxlist:', ((79496, 88516, 2944, 8), 'Entire home/apt', (2944, 8)))
DEBUG:root:('change_values idxlist:', ((79523, 88517, 2945, 8), 'Entire home/apt', (2945, 8)))
DEBUG:root:('change_values idxlist:', ((79793, 88529, 2955, 8), 'Entire home/apt', (2955, 8)))
DEBUG:root:('change_values idxlist:', ((79847, 88530, 2957, 8), 'Entire home/apt', (2957, 8)))
DEBUG:root:('change_values idxlist:', ((79982, 88533, 2962, 8), 'Entire home/apt', (2962, 8)))
DEBUG:root:('change_values idxlist:', ((80765, 88552, 2991, 8), 'Entire home/apt', (2991, 8)))
DEBUG:root:('change_values idxlist:', ((80927, 885

True

In [56]:
workset_before = workset_after.copy()

In [59]:
# Rebuild the cell-value table and show only the rows whose column 2 equals 21.
check_pd = pd.DataFrame(dcm1.cell_values)
state_21_mask = check_pd[2].eq(21)
check_pd.loc[state_21_mask]

Unnamed: 0,0,1,2,3,4
92123,92123,40751,21,Entire home/apt,87498
92124,92124,40859,21,Entire home/apt,87502
92125,92125,41075,21,Entire home/apt,87505
92126,92126,41210,21,Entire home/apt,87511
92127,92127,41615,21,Entire home/apt,87520
...,...,...,...,...,...
92338,92338,85976,21,Entire home/apt,88709
92339,92339,86246,21,Entire home/apt,88713
92340,92340,86273,21,Entire home/apt,88714
92341,92341,86327,21,Entire home/apt,88715


In [60]:
check_pd = pd.DataFrame(dcm1.cell_values)
check_pd.groupby(2).count()

Unnamed: 0_level_0,0,1,3,4
2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,86427,86427,86427,86427
0,2291,2291,2291,2291
1,70,70,70,70
2,354,354,354,354
3,322,322,322,322
4,84,84,84,84
5,90,90,90,90
6,198,198,198,198
7,28,28,28,28
8,47,47,47,47


In [61]:
pd.DataFrame(dcm1.column_position)

Unnamed: 0,0,1,2,3,4,5
0,0,0,-1,id,-1,-1
1,1,1,-1,name,0,-1
2,2,2,-1,host_id,1,-1
3,3,3,-1,host_name,2,-1
4,4,4,-1,neighbourhood_group,3,-1
5,5,5,-1,neighbourhood,4,-1
6,6,6,-1,latitude,5,-1
7,7,7,-1,longitude,6,-1
8,8,8,-1,room_type,7,-1
9,9,9,-1,price,8,-1


In [62]:
# Overwrite the first state_detail entry with the 'initial' tuple.
# NOTE(review): the state_detail listings displayed earlier in this notebook
# already show this exact tuple at index 0 — confirm whether this reset is
# still needed.
dcm1.state_detail[0] = (0, {'op': 'initial'}, -1, -1, None)

In [63]:
pd.DataFrame(dcm1.state_detail)

Unnamed: 0,0,1,2,3,4
0,0,{'op': 'initial'},-1,-1,
1,0,perturbed_dataset,0,-1,0.0
2,1,clean_duplicate_id_c1,1,0,1.0
3,2,clean_inconsistent_host_id_c1,2,1,1.0
4,3,clean_neighbourhood_c1,3,2,1.0
5,4,clean_latitude_c1,4,3,1.0
6,5,clean_longitude_c1,5,4,1.0
7,6,clean_room_type_c1,6,5,1.0
8,7,clean_minimum_nights_c1,7,6,1.0
9,8,clean_number_of_reviews_c1,8,7,1.0


In [64]:
# Rebuild the cell-value table and show only the rows whose column 2 equals 9.
cv = pd.DataFrame(dcm1.cell_values)
state_9_mask = cv[2].eq(9)
cv.loc[state_9_mask]

Unnamed: 0,0,1,2,3,4
89911,89911,66,9,2022-11-28,86427
89912,89912,79,9,0,79
89913,89913,430,9,2,430
89914,89914,741,9,2022-11-27,86448
89915,89915,754,9,0,754
...,...,...,...,...,...
90320,90320,40012,9,2,40012
90321,90321,40039,9,2,40039
90322,90322,40066,9,2,40066
90323,90323,40093,9,2,40093


In [65]:
pd.DataFrame(dcm1.state)

Unnamed: 0,0,1
0,-1,-2
1,0,-1
2,1,0
3,2,1
4,3,2
5,4,3
6,5,4
7,6,5
8,7,6
9,8,7


In [66]:
# Join cell values -> cells -> column positions -> state details into one wide table.
(
    pd.DataFrame(dcm1.cell_values)
    .merge(pd.DataFrame(dcm1.cell), left_on=1, right_on=0, suffixes=["_x", "_y"])
    .merge(pd.DataFrame(dcm1.column_position), left_on="1_y", right_on=1, suffixes=["_xx", "_z"])
    .merge(pd.DataFrame(dcm1.state_detail), left_on="2_x", right_on=0, suffixes=["_zz", "_k"])
)

Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
0,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
1,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
2,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
3,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
4,6913,86607,6913,0,,6913,6913,1,256,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8202,85994,91397,85994,15,1,85994,85994,26,3184,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8203,86264,91398,86264,15,1,86264,86264,26,3194,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8204,86291,91399,86291,15,1,86291,86291,26,3195,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8205,86345,91400,86345,15,1,86345,86345,26,3197,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0


In [67]:
# Same three-way join as the previous cell, kept in check_pd for the checks below.
# Duplicate rows are not removed here.
cell_values_df = pd.DataFrame(dcm1.cell_values)
cell_df = pd.DataFrame(dcm1.cell)
column_position_df = pd.DataFrame(dcm1.column_position)
state_detail_df = pd.DataFrame(dcm1.state_detail)

values_with_cells = cell_values_df.merge(
    cell_df, left_on=1, right_on=0, suffixes=["_x", "_y"]
)
values_with_columns = values_with_cells.merge(
    column_position_df, left_on="1_y", right_on=1, suffixes=["_xx", "_z"]
)
check_pd = values_with_columns.merge(
    state_detail_df, left_on="2_x", right_on=0, suffixes=["_zz", "_k"]
)


In [68]:
# Keep only the rows whose recorded value ("3_xx") is not null.
check_pd[check_pd["3_xx"].notna()]

Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
0,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
1,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
2,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
3,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
4,6913,86607,6913,0,,6913,6913,1,256,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8202,85994,91397,85994,15,1,85994,85994,26,3184,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8203,86264,91398,86264,15,1,86264,86264,26,3194,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8204,86291,91399,86291,15,1,86291,86291,26,3195,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8205,86345,91400,86345,15,1,86345,86345,26,3197,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0


In [69]:
cv

Unnamed: 0,0,1,2,3,4
0,0,0,-1,25879,-1
1,1,1,-1,2/1 One Block to Fullerton L Red Line Deck & ...,-1
2,2,2,-1,101521,-1
3,3,3,-1,Red,-1
4,4,4,-1,,-1
...,...,...,...,...,...
92338,92338,85976,21,Entire home/apt,88709
92339,92339,86246,21,Entire home/apt,88713
92340,92340,86273,21,Entire home/apt,88714
92341,92341,86327,21,Entire home/apt,88715


In [70]:
# Values recorded in state 0, restricted to cells whose column id is 6.
cv = pd.DataFrame(dcm1.cell_values)
cellv = pd.DataFrame(dcm1.cell)
state_0_values = cv.loc[cv[2].eq(0)]
column_6_cells = cellv.loc[cellv[1].eq(6)]
state_0_values.merge(column_6_cells, left_on=1, right_on=0)

Unnamed: 0,1,0_x,1_x,2_x,3,4,0_y,1_y,2_y
0,3624,86509,3624,0,E41.9A2748,3624,3624,6,134
1,5082,86548,5082,0,4L1.9320u3,5082,5082,6,188
2,5163,86551,5163,0,41.9vB3123,5163,5163,6,191
3,6216,86580,6216,0,E41.91 334,6216,6216,6,230
4,6243,86582,6243,0,4N1C.69097,6243,6243,6,231
...,...,...,...,...,...,...,...,...,...
189,85785,88695,85785,0,418167,85785,85785,6,3177
190,85893,88699,85893,0,418945286,85893,85893,6,3181
191,85974,88708,85974,0,4190362,85974,85974,6,3184
192,86190,88711,86190,0,4190271,86190,86190,6,3192


In [71]:
# Pair each state -1 value with the state 0 value of the same cell,
# then keep only cells whose column id is 13.
state_minus1_values = cv[cv[2].isin([-1])]
state_0_values = cv[cv[2].isin([0])]
column_13_cells = cellv[cellv[1] == 13]
(
    state_minus1_values
    .merge(state_0_values, left_on=1, right_on=1)
    .merge(column_13_cells, left_on=1, right_on=0)
)

Unnamed: 0,1,0_x,1_x,2_x,3_x,4_x,0_y,2_y,3_y,4_y,0,1_y,2
0,283,283,283,-1,0.60,-1,86434,0,0.6,283,283,13,10
1,472,472,472,-1,1.50,-1,86443,0,1.5,472,472,13,17
2,985,985,985,-1,0.50,-1,86453,0,0.5,985,985,13,36
3,1012,1012,1012,-1,1.20,-1,86454,0,1.2,1012,1012,13,37
4,1066,1066,1066,-1,1.10,-1,86456,0,1.1,1066,1066,13,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,85414,85414,85414,-1,1,-1,88687,0,1.0,85414,85414,13,3163
292,85441,85441,85441,-1,2,-1,88688,0,2.0,85441,85441,13,3164
293,85846,85846,85846,-1,1,-1,88698,0,1.0,85846,85846,13,3179
294,85927,85927,85927,-1,1,-1,88704,0,1.0,85927,85927,13,3182


In [72]:
cellv[cellv[0]==3624]

Unnamed: 0,0,1,2
3624,3624,6,134


In [73]:
# Show every recorded value of cell 3624 across all states.
# The two intermediate lookups that used to sit here (test_cv filtered by
# cell 3624 and by id 89348) were bare expressions in the middle of the cell:
# their results were neither displayed nor stored, so they have been removed.
test_cv = cv[cv[2]==0]
cv[cv[1]==3624]

Unnamed: 0,0,1,2,3,4
3624,3624,3624,-1,41.92748,-1
86509,86509,3624,0,E41.9A2748,3624
89464,89464,3624,4,41.92748,86509


In [74]:
cellv[cellv[1]==6].shape

(3201, 3)

In [75]:
# Cells whose column id is 6.
# A leading `pd.DataFrame(dcm1.column_position)` expression was removed: only
# the last expression of a cell is displayed, so it was built and discarded.
cellv[cellv[1]==6]

Unnamed: 0,0,1,2
6,6,6,0
33,33,6,1
60,60,6,2
87,87,6,3
114,114,6,4
...,...,...,...
86298,86298,6,3196
86325,86325,6,3197
86352,86352,6,3198
86379,86379,6,3199


In [76]:
super_ori_pd

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,license,id_flag,host_id_flag,neighbourhood_flag,latitude_flag,longitude_flag,minimum_nights_flag,number_of_reviews_flag,last_review_flag,room_type_flag
0,25879,2/1 One Block to Fullerton L Red Line Deck & ...,101521,Red,,Lincoln Park,41.92693,-87.65753,Entire home/apt,94,...,City registration pending,0,0,0,0,0,0,0,0,0
1,37738,Andersonville - Perfect location!,162364,Mat And Randy,,Uptown,41.9729,-87.66538,Private room,110,...,R20000059426,0,0,0,0,0,0,0,0,0
2,189821,"Best in Chicago, private, amazing garden space",899757,Meighan,,Logan Square,41.92918,-87.70219,Entire home/apt,236,...,R21000062936,0,0,0,0,0,0,0,0,0
3,207218,Historic Pullman Artist Flat - Artists & Explo...,1019125,Jb,,Pullman,41.6883,-87.60892,Entire home/apt,100,...,R21000073121,0,0,0,0,0,0,0,0,0
4,220333,Pullman School House Apartment - monthly rental,1019125,Jb,,Pullman,41.68815,-87.60918,Entire home/apt,100,...,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,782628636832878491,"Steps to Shop, Eat, Train | Easy Access | Zencity",47172572,Zencity,,West Town,41.8955,-87.66124,Entire home/apt,50,...,R19000043484,0,0,0,0,0,0,0,0,0
3197,782643895516370805,Old Town Oasis,169297663,William,,Near North Side,41.90105,-87.63716,Entire home/apt,120,...,R22000093645,0,0,0,0,0,0,0,0,0
3198,784994899201350568,Lovely 1 bed Apt in River North,52827024,Yakir,,Near West Side,41.88822006416301,-87.64145321718578,Entire home/apt,84,...,,0,0,0,0,0,0,0,0,0
3199,785423932330914663,"River North 1br w/ gym, pool & roof, nr Riverwalk",107434423,Blueground,,Near North Side,41.890516,-87.635955,Entire home/apt,169,...,,0,0,0,0,0,0,0,0,0


In [77]:
# Index labels of rows whose "id" repeats an earlier row's id.
is_repeat_id = super_ori_pd.duplicated(["id"])
duplicated_index = super_ori_pd.loc[is_repeat_id].index.tolist()

In [78]:
# Pair each state 4 value with the state -1 value of the same cell and
# show the pairs that are numerically equal.
cv = pd.DataFrame(dcm1.cell_values)
state_4_values = cv[cv[2] == 4]
state_minus1_values = cv[cv[2] == -1]
test_cv = state_4_values.merge(state_minus1_values, left_on=1, right_on=1)
state_4_floats = test_cv["3_x"].apply(lambda raw: float(raw))
state_minus1_floats = test_cv["3_y"].apply(lambda raw: float(raw))
test_cv[state_4_floats == state_minus1_floats]

Unnamed: 0,0_x,1,2_x,3_x,4_x,0_y,2_y,3_y,4_y
0,89464,3624,4,41.92748,86509,3624,-1,41.92748,-1
1,89465,5082,4,41.93203,86548,5082,-1,41.93203,-1
2,89466,5163,4,41.93123,86551,5163,-1,41.93123,-1
3,89467,6216,4,41.91334,86580,6216,-1,41.91334,-1
4,89468,6243,4,41.69097,86582,6243,-1,41.69097,-1
...,...,...,...,...,...,...,...,...,...
79,89543,37428,4,41.955908,87410,37428,-1,41.9559085,-1
80,89544,37455,4,42.00246,87411,37455,-1,42.00246,-1
81,89545,37779,4,41.881121,87418,37779,-1,41.8811213,-1
82,89546,38049,4,41.84067,87430,38049,-1,41.84067,-1


In [79]:
# Pairs from test_cv whose two values differ once both are parsed as floats.
left_floats = test_cv["3_x"].apply(lambda raw: float(raw))
right_floats = test_cv["3_y"].apply(lambda raw: float(raw))
test_cv[left_floats != right_floats]

Unnamed: 0,0_x,1,2_x,3_x,4_x,0_y,2_y,3_y,4_y
32,89496,14493,4,2,14493,14493,-1,0,-1


In [80]:
check_pd

Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
0,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
1,4699,86536,4699,0,,4699,4699,1,174,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
2,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
3,5563,86558,5563,0,,5563,5563,1,206,1,1,-1,name,0,-1,0,perturbed_dataset,0,-1,0.0
4,6913,86607,6913,0,,6913,6913,1,256,1,1,-1,name,0,-1,0,{'op': 'initial'},-1,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8202,85994,91397,85994,15,1,85994,85994,26,3184,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8203,86264,91398,86264,15,1,86264,86264,26,3194,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8204,86291,91399,86291,15,1,86291,86291,26,3195,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0
8205,86345,91400,86345,15,1,86345,86345,26,3197,26,26,-1,room_type_flag,25,-1,15,clean_room_type_c2,15,14,2.0


In [81]:
# Drop rows whose "2_y" value is one of the duplicated-id index labels.
# isin() replaces the per-row `x not in duplicated_index` list scan.
check_pd = check_pd[~check_pd["2_y"].isin(duplicated_index)]
# Drop rows whose recorded value is the empty string. This filter used to be
# written twice in a row; the second copy could not remove anything further.
check_pd = check_pd[check_pd["3_xx"]!=""]

In [83]:
# change report
# Count distinct rows per (column 4, column 1, "3_z") group, for rows whose
# column 4 is 0-3.
tracked_rows = check_pd[check_pd[4].isin([0, 1, 2, 3])]
distinct_rows = tracked_rows.drop_duplicates()
distinct_rows.groupby([4, 1, "3_z"]).count()[["1_xx"]]  # .to_csv("test.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1_xx
4,1,3_z,Unnamed: 3_level_1
0.0,perturbed_dataset,host_name,149
0.0,perturbed_dataset,last_review,126
0.0,perturbed_dataset,latitude,161
0.0,perturbed_dataset,longitude,160
0.0,perturbed_dataset,minimum_nights,53
0.0,perturbed_dataset,neighbourhood,636
0.0,perturbed_dataset,number_of_reviews,84
0.0,perturbed_dataset,reviews_per_month,285
0.0,perturbed_dataset,room_type,370
1.0,clean_inconsistent_host_id_c1,host_id_flag,337


In [84]:
# Shape of the perturbed room_type rows whose "2_y" is below 1486.
per_test = check_pd[check_pd[4].isin([0, 1, 2, 3])]
is_room_type = per_test["3_z"] == "room_type"
is_perturbed = per_test[1] == "perturbed_dataset"
below_split = per_test["2_y"] < 1486
per_test[is_room_type & is_perturbed & below_split].shape

(178, 20)

In [85]:
# Shape of the perturbed room_type rows whose "2_y" is 1486 or above.
per_test = check_pd[check_pd[4].isin([0, 1, 2, 3])]
is_room_type = per_test["3_z"] == "room_type"
is_perturbed = per_test[1] == "perturbed_dataset"
from_split = per_test["2_y"] >= 1486
per_test[is_room_type & is_perturbed & from_split].shape

(192, 20)

In [86]:
# Match perturbed room_type rows with clean_room_type_c1 rows on "2_y" and
# count the matches per "3_z" value of the c1 rows.
perturbed_room_type = per_test[
    (per_test["3_z"] == "room_type") & (per_test[1] == "perturbed_dataset")
]
c1_room_type_step = per_test[per_test[1] == "clean_room_type_c1"]
(
    perturbed_room_type
    .merge(c1_room_type_step, left_on="2_y", right_on="2_y")
    .groupby("3_z_y")
    .count()
)

Unnamed: 0_level_0,1_xx_x,0_x_x,1_x_x,2_x_x,3_xx_x,4_xx_x,0_y_x,1_y_x,2_y,0_zz_x,...,0_zz_y,1_z_y,2_zz_y,4_z_y,5_y,0_k_y,1_y,2_k_y,3_y,4_y
3_z_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
room_type,178,178,178,178,178,178,178,178,178,178,...,178,178,178,178,178,178,178,178,178,178


In [87]:
# Match perturbed room_type rows with clean_room_type_c2 rows on "2_y" and
# count the matches per "3_z" value of the c2 rows.
perturbed_room_type = per_test[
    (per_test["3_z"] == "room_type") & (per_test[1] == "perturbed_dataset")
]
c2_room_type_step = per_test[per_test[1] == "clean_room_type_c2"]
(
    perturbed_room_type
    .merge(c2_room_type_step, left_on="2_y", right_on="2_y")
    .groupby("3_z_y")
    .count()
)

Unnamed: 0_level_0,1_xx_x,0_x_x,1_x_x,2_x_x,3_xx_x,4_xx_x,0_y_x,1_y_x,2_y,0_zz_x,...,0_zz_y,1_z_y,2_zz_y,4_z_y,5_y,0_k_y,1_y,2_k_y,3_y,4_y
3_z_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
room_type_flag,192,192,192,192,192,192,192,192,192,192,...,192,192,192,192,192,192,192,192,192,192


In [88]:
# Latitude rows with "2_y" >= 1486 whose "2_y" never appears in a
# clean_latitude_c2 row.
is_latitude = per_test["3_z"] == "latitude"
from_split = per_test["2_y"] >= 1486
tper = per_test[is_latitude & from_split]
c2_latitude_keys = per_test[per_test[1] == "clean_latitude_c2"]["2_y"].values
tper[~tper["2_y"].isin(c2_latitude_keys)]


Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
1947,45879,87634,45879,0,41.91298675537109,45879,45879,6,1699,6,6,-1,latitude,5,-1,0,perturbed_dataset,0,-1,0.0
2071,76335,88444,76335,0,41.71286010742188,76335,76335,6,2827,6,6,-1,latitude,5,-1,0,perturbed_dataset,0,-1,0.0
2111,85326,88684,85326,0,41.82881576674946,85326,85326,6,3160,6,6,-1,latitude,5,-1,0,perturbed_dataset,0,-1,0.0


In [89]:
# Match perturbed latitude rows with every non-perturbed row on "2_y" and
# count the matches per ("3_z", column 1) of the non-perturbed rows.
perturbed_latitude = per_test[
    (per_test["3_z"] == "latitude") & (per_test[1] == "perturbed_dataset")
]
non_perturbed_rows = per_test[per_test[1] != "perturbed_dataset"]
(
    perturbed_latitude
    .merge(non_perturbed_rows, left_on="2_y", right_on="2_y")
    .groupby(["3_z_y", "1_y"])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,1_xx_x,0_x_x,1_x_x,2_x_x,3_xx_x,4_xx_x,0_y_x,1_y_x,2_y,0_zz_x,...,1_y_y,0_zz_y,1_z_y,2_zz_y,4_z_y,5_y,0_k_y,2_k_y,3_y,4_y
3_z_y,1_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
host_id_flag,clean_inconsistent_host_id_c1,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
host_id_flag,clean_inconsistent_host_id_c2,13,13,13,13,13,13,13,13,13,13,...,13,13,13,13,13,13,13,13,13,13
id_flag,clean_duplicate_id_c2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
last_review,clean_last_review_c1,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
last_review,clean_last_review_c2,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
last_review_flag,clean_last_review_c1,19,19,19,19,19,19,19,19,19,19,...,19,19,19,19,19,19,19,19,19,19
last_review_flag,clean_last_review_c2,26,26,26,26,26,26,26,26,26,26,...,26,26,26,26,26,26,26,26,26,26
latitude,clean_latitude_c1,73,73,73,73,73,73,73,73,73,73,...,73,73,73,73,73,73,73,73,73,73
latitude,clean_latitude_c3,85,85,85,85,85,85,85,85,85,85,...,85,85,85,85,85,85,85,85,85,85
latitude_flag,clean_latitude_c2,85,85,85,85,85,85,85,85,85,85,...,85,85,85,85,85,85,85,85,85,85


In [97]:
# neighbourhood_flag rows recorded by the clean_neighbourhood_c1 step.
per_test = check_pd[check_pd[4].isin([0, 1, 2, 3])]
is_neighbourhood_flag = per_test["3_z"] == "neighbourhood_flag"
from_c1_step = per_test[1] == "clean_neighbourhood_c1"
per_test[is_neighbourhood_flag & from_c1_step]

Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
4900,11144,89233,11144,3,2,11144,11144,20,412,20,20,-1,neighbourhood_flag,19,-1,3,clean_neighbourhood_c1,3,2,1.0
4901,16112,89268,16112,3,2,16112,16112,20,596,20,20,-1,neighbourhood_flag,19,-1,3,clean_neighbourhood_c1,3,2,1.0
4902,17381,89279,17381,3,2,17381,17381,20,643,20,20,-1,neighbourhood_flag,19,-1,3,clean_neighbourhood_c1,3,2,1.0


In [98]:
# latitude_flag rows recorded by the clean_latitude_c3 step.
per_test = check_pd[check_pd[4].isin([0, 1, 2, 3])]
is_latitude_flag = per_test["3_z"] == "latitude_flag"
from_c3_step = per_test[1] == "clean_latitude_c3"
per_test[is_latitude_flag & from_c3_step]

Unnamed: 0,1_xx,0_x,1_x,2_x,3_xx,4_xx,0_y,1_y,2_y,0_zz,1_z,2_zz,3_z,4_z,5,0_k,1,2_k,3,4
5459,51402,91956,51402,19,1,91015,51402,21,1903,21,21,-1,latitude_flag,20,-1,19,clean_latitude_c3,19,0,3.0
5460,56073,91963,56073,19,1,91022,56073,21,2076,21,21,-1,latitude_flag,20,-1,19,clean_latitude_c3,19,0,3.0
5462,68682,91985,68682,19,1,91044,68682,21,2543,21,21,-1,latitude_flag,20,-1,19,clean_latitude_c3,19,0,3.0
5466,84693,92021,84693,19,1,91080,84693,21,3136,21,21,-1,latitude_flag,20,-1,19,clean_latitude_c3,19,0,3.0


In [100]:
# write datalog facts
# Write one fact per tuple to dcm_3.pl, in the form  predicate("a","b",...).
# Each entry pairs a predicate name with the dcm1 collection that feeds it.
# column and row go through set() so that repeated tuples are written once;
# the other collections are written in their stored order.
fact_sources = [
    ("column", set(dcm1.column)),
    ("row", set(dcm1.row)),
    ("cell", dcm1.cell),
    ("cell_values", dcm1.cell_values),
    ("col_dependency", dcm1.col_dependency),
    ("value_derived_from", dcm1.value_derived_from),
    ("column_position", dcm1.column_position),
]
# An explicit encoding keeps the output independent of the platform's default
# locale, which matters as soon as a value contains non-ASCII text.
with open("dcm_3.pl", "w", encoding="utf-8") as file:
    for predicate, tuples in fact_sources:
        for x in tuples:
            arguments = '","'.join(str(y) for y in x)
            # NOTE(review): values are wrapped in double quotes without any
            # escaping, so a value that itself contains '"' yields a malformed fact.
            file.write('{}("{}").\n'.format(predicate, arguments))

In [101]:
#save dcmx in pickle
# Persist the dcm1 object to "dcm_collab_c1_c2_c3.pickle" (relative path, binary mode).
# NOTE(review): only load this file back from a trusted location — unpickling
# executes code embedded in the file.
import pickle
with open("dcm_collab_c1_c2_c3.pickle","wb") as file:
    pickle.dump(dcm1,file)