# WG-Gesucht Data Analysis
#### Step 1: Data Cleaning 


- Data cleaning involves filtering out ads with premium status and removing duplicates.
- Duplicates are identified by comparing the title and address of ads that share the same apartment ID.


In [21]:
import pandas as pd
import os

# The crawler output is addressed relative to the directory one level above
# the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
apartmentsDataPath = os.path.join(
    parent_dir,
    'WebCrawlerApp/data/output/apartmentsBerlinData.csv',
)

# Load the CSV, restricted to its first 45 columns (positions 0-44)
df = pd.read_csv(apartmentsDataPath, usecols=list(range(45)))

print(f"The length of the entire dataset is: {len(df)}")

The length of the entire dataset is: 16120


In [22]:
from IPython.display import display, HTML

def display_as_table(df):
    """Show ``df`` as an HTML table inside a horizontally scrollable container.

    The frame is rendered with ``DataFrame.to_html`` and wrapped in a ``<div>``
    with ``overflow-x:auto`` so wide tables scroll instead of overflowing.

    Returns whatever ``display`` returns.
    """
    table_markup = df.to_html()
    wrapped_markup = '<div style="overflow-x:auto;">' + table_markup + '</div>'
    return display(HTML(wrapped_markup))

##### Step 1.1: Filter Premium Ads from the dataset

In [23]:
# Keep only the rows whose 'premiumstatus' value equals False
is_regular_ad = df['premiumstatus'].eq(False)
df_without_premium = df.loc[is_regular_ad]

# Report the remaining size and how many rows the filter removed
print(f"The size of the dataset reduced by Premium ads is: {len(df_without_premium)} \n\nThere are {len(df) - len(df_without_premium)} Premium ads in the dataset.")

The size of the dataset reduced by Premium ads is: 10233 

There are 5887 Premium ads in the dataset.


##### Step 1.2: Filter duplicate IDs from the WG-Gesucht Dataset

In [24]:
# Overwrite the original DataFrame with the one without Premium ads
df = df_without_premium

# Step 1: Find duplicates based on apartmentID
# keep=False marks every row of a repeated apartmentID, not only the later occurrences.
dup_rows = df[df.duplicated(subset=['apartmentID'], keep=False)]
dup_ids = dup_rows['apartmentID'].unique()

# Columns of the comparison DataFrame, in display order
comparison_columns = ['apartmentID1', 'apartmentID2', 'same_id', 'title1', 'title2', 'same_title', 'address1', 'address2', 'same_address', 'street1', 'street2', 'same_street', 'zip1', 'zip2', 'same_zip', 'city1', 'city2', 'same_city']

# Output label -> source column that is compared between the two listings
compared_fields = {
    'title': 'title',
    'address': 'address',
    'street': 'street',
    'zip': 'postcode',
    'city': 'city',
}

# Step 2: Compare the first two listings of every duplicated apartmentID.
# The rows are collected in a list and converted to a DataFrame once. Calling
# pd.concat inside the loop re-copies all earlier rows on every pass and leaves
# every row with index 0.
# sort=False keeps the groups in order of first appearance; groupby skips rows
# whose apartmentID is missing.
comparison_records = []
for apt_id, dup_df in dup_rows.groupby('apartmentID', sort=False):
    # Choose the first two entries for comparison
    first, second = dup_df.iloc[0], dup_df.iloc[1]

    # Create a new row for the comparison DataFrame
    new_row = {
        'apartmentID1': first['apartmentID'],
        'apartmentID2': second['apartmentID'],
        'same_id': True,  # Because both rows share the same apartmentID
    }
    for label, source_col in compared_fields.items():
        new_row[f'{label}1'] = first[source_col]
        new_row[f'{label}2'] = second[source_col]
        # NOTE: NaN == NaN is False, so a field missing in both rows counts as different.
        new_row[f'same_{label}'] = first[source_col] == second[source_col]
    comparison_records.append(new_row)

# One row per duplicated apartmentID, with a unique 0..n-1 index
df_comparison = pd.DataFrame(comparison_records, columns=comparison_columns)

print(f"Number of duplicates based on apartmentID: {len(df_comparison)}")


Number of duplicates based on apartmentID: 1358


###### Step 1.2.1: View same IDs with different titles and same address

In [25]:
# Pairs that share an ID and an address but whose titles differ
id_matches = df_comparison['same_id'].eq(True)
title_differs = df_comparison['same_title'].eq(False)
address_matches = df_comparison['same_address'].eq(True)
df_falsetitle = df_comparison.loc[id_matches & title_differs & address_matches]

print(f"Number of duplicates based on apartmentID with different titles: {len(df_falsetitle)}")

#display_as_table(df_falsetitle) # Here you can output the entire table of the False titles with duplicate ids and addresses

Number of duplicates based on apartmentID with different titles: 229


When you view the table, you can see that the user or owner of the ad changed the title over time.

- Result: Duplicates based on ID and Title will be removed from the dataset. 

In [26]:
# Pairs that share an ID and a title but whose addresses differ
shares_id = df_comparison['same_id'].eq(True)
shares_title = df_comparison['same_title'].eq(True)
address_differs = df_comparison['same_address'].eq(False)
df_falseadressAndTitle = df_comparison.loc[shares_id & shares_title & address_differs]

display_as_table(df_falseadressAndTitle)

# Number of such pairs (shown as the cell output)
len(df_falseadressAndTitle)

Unnamed: 0,apartmentID1,apartmentID2,same_id,title1,title2,same_title,address1,address2,same_address,street1,street2,same_street,zip1,zip2,same_zip,city1,city2,same_city
0,10700289,10700289,True,"WG-Zimmer (14,5 m2) in Berlin Lichtenberg im Weitlingkiez","WG-Zimmer (14,5 m2) in Berlin Lichtenberg im Weitlingkiez",True,Kraetkestraße 3810315 Berlin Friedrichsfelde,Kraetkestraße10315 Berlin Friedrichsfelde,False,Kraetkestraße 38,Kraetkestraße,False,10315.0,10315.0,True,Berlin,Berlin,True
0,10701430,10701430,True,1 Zimmer 20qm in WG mit großem Wohnzimmer in Schöneberg,1 Zimmer 20qm in WG mit großem Wohnzimmer in Schöneberg,True,Ebersstraße10827 Berlin Schöneberg,Ebersstr.10827 Berlin Schöneberg,False,Ebersstraße,Ebersstr.,False,10827.0,10827.0,True,Berlin,Berlin,True
0,10123935,10123935,True,TOP WG ZIMMER Neubezug (hell und freundlich),TOP WG ZIMMER Neubezug (hell und freundlich),True,. .12489 Berlin Adlershof,. ..12489 Berlin Adlershof,False,. .,. ..,False,12489.0,12489.0,True,Berlin,Berlin,True
0,10693079,10693079,True,Großes Zimmer in 2 raumwohnung,Großes Zimmer in 2 raumwohnung,True,gundelfingerstr 2110318 Berlin Karlshorst,gundelfingerstr 2010318 Berlin rummelsburg / karlshorst,False,gundelfingerstr 21,gundelfingerstr 20,False,10318.0,10318.0,True,Berlin,Berlin,True
0,10628077,10628077,True,teilmöbliertes WG Zimmer am Rande von Berlin,teilmöbliertes WG Zimmer am Rande von Berlin,True,Egon-Erwin-Kisch-Straße13059 Berlin Neu-Hohenschönhausen,Egon-Erwin-Kisch-Straße13059 Berlin Wartenberg,False,Egon-Erwin-Kisch-Straße,Egon-Erwin-Kisch-Straße,True,13059.0,13059.0,True,Berlin,Berlin,True
0,9770579,9770579,True,Schönes möbliertes Zimmer im Norden von Berlin kurzfristig zu haben - 40 minutes to get to the center of Berlin,Schönes möbliertes Zimmer im Norden von Berlin kurzfristig zu haben - 40 minutes to get to the center of Berlin,True,Birkholzer Straße 3616341 Berlin Pankow,Alt-Buch 4613125 Berlin Pankow,False,Birkholzer Straße 36,Alt-Buch 46,False,16341.0,13125.0,False,Berlin,Berlin,True
0,8461937,8461937,True,Schönes Zimmer in Weißensee,Schönes Zimmer in Weißensee,True,Gustav-Adolf-Straße13086 Berlin Weißensee,Gustav-Adolf-Straße 146A13086 Berlin Weißensee,False,Gustav-Adolf-Straße,Gustav-Adolf-Straße 146A,False,13086.0,13086.0,True,Berlin,Berlin,True
0,4586304,4586304,True,helles Zimmer in schöner Altbauwohnung,helles Zimmer in schöner Altbauwohnung,True,Albrechtstr.12167 Berlin Steglitz,Albrechtstraße12167 Berlin Steglitz,False,Albrechtstr.,Albrechtstraße,False,12167.0,12167.0,True,Berlin,Berlin,True
0,9945000,9945000,True,Zimmer frei (befristet),Zimmer frei (befristet),True,Scharnweberstr. 12713405 Berlin Reinickendorf,Scharnweberstr. 12713405 Berlin Wedding,False,Scharnweberstr. 127,Scharnweberstr. 127,True,13405.0,13405.0,True,Berlin,Berlin,True
0,9019212,9019212,True,Studio im Reuterkiez Kreuzkölln,Studio im Reuterkiez Kreuzkölln,True,Reuterplatz 112047 Berlin Neukölln,Reuterstrasse 2812047 Berlin Neukölln,False,Reuterplatz 1,Reuterstrasse 28,False,12047.0,12047.0,True,Berlin,Berlin,True


18

In [None]:
# Select the pairs with the same ID and the same title but a different address
df_falseadress = df_comparison.loc[
    df_comparison['same_id'].eq(True)
    & df_comparison['same_title'].eq(True)
    & df_comparison['same_address'].eq(False)
]

# Last expression of the cell: rendered as the cell output
df_falseadress