# WG - Gesucht Data Analysis 
#### Step 1: Data Cleaning 

- Data cleaning consists of filtering out ads with premium status and removing duplicates.
- Duplicates are identified by repeated apartment IDs, which are cross-checked against the title and address of the ads.


In [140]:
import pandas as pd
import os

# Locate the apartments CSV relative to the directory one level above the
# current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
apartmentsDataPath = os.path.join(
    parent_dir,
    'WebCrawlerApp/data/output/apartmentsBerlinData.csv',
)

# Load only the first 45 columns (positions 0-44) of the file.
df = pd.read_csv(apartmentsDataPath, usecols=list(range(45)))

print(f"The length of the entire dataset is: {len(df)}")

The length of the entire dataset is: 16120


In [141]:
%run './NotebookSetup/Style.ipynb'

##### Step 1.1: Filter Premium Ads from the dataset

In [142]:
# Keep only the rows whose `premiumstatus` flag compares equal to False.
df_without_premium = df.loc[df['premiumstatus'].eq(False)]

print(f"The size of the dataset reduced by Premium ads is: {len(df_without_premium)} \n\nThere are {len(df) - len(df_without_premium)} Premium ads in the dataset.")

The size of the dataset reduced by Premium ads is: 10233 

There are 5887 Premium ads in the dataset.


##### Step 1.2: Analyze duplicate IDs in the WG-Gesucht Dataset

In [143]:
# Overwrite the original DataFrame with the one without Premium ads.
# NOTE: from here on `df` no longer holds the full CSV contents, and
# `compare_duplicates` below reads this module-level `df`.
df = df_without_premium

def compare_duplicates(column_name: str, data: "pd.DataFrame | None" = None):
    """Pair up rows that share a value in ``column_name`` and compare them.

    For every value of ``column_name`` that occurs more than once, the first
    two rows carrying that value are compared on apartment ID, title,
    address, street, postcode and city.

    Args:
        column_name: Column whose repeated values define a duplicate group.
        data: Frame to inspect. When omitted, the module-level ``df`` is used,
            so existing single-argument calls keep working.

    Returns:
        A DataFrame with one row per duplicated value (index 0..n-1) holding
        both rows' values side by side plus a ``same_*`` flag per field.
        Rows whose ``column_name`` is missing are never paired up.

    Side effects:
        Prints the number of duplicate groups found.
    """
    frame = df if data is None else data

    comparison_columns = [
        'apartmentID1', 'apartmentID2', 'same_id',
        'title1', 'title2', 'same_title',
        'address1', 'address2', 'same_address',
        'street1', 'street2', 'same_street',
        'zip1', 'zip2', 'same_zip',
        'city1', 'city2', 'same_city',
    ]

    # keep=False flags every member of a duplicate group, so each group
    # produced below is guaranteed to contain at least two rows.
    dup_rows = frame[frame.duplicated(subset=[column_name], keep=False)]

    # Collect plain records and build the frame once instead of growing it
    # with pd.concat inside the loop (quadratic, and left every index at 0).
    records = []
    # sort=False keeps groups in order of first appearance; groupby drops
    # missing keys, which could never be matched by equality anyway.
    for _, group in dup_rows.groupby(column_name, sort=False):
        first, second = group.iloc[0], group.iloc[1]
        records.append({
            'apartmentID1': first['apartmentID'],
            'apartmentID2': second['apartmentID'],
            # Compare the IDs for real: when grouping by a column other than
            # apartmentID the two rows may well carry different IDs.
            'same_id': first['apartmentID'] == second['apartmentID'],
            'title1': first['title'],
            'title2': second['title'],
            'same_title': first['title'] == second['title'],
            'address1': first['address'],
            'address2': second['address'],
            'same_address': first['address'] == second['address'],
            'street1': first['street'],
            'street2': second['street'],
            'same_street': first['street'] == second['street'],
            'zip1': first['postcode'],
            'zip2': second['postcode'],
            'same_zip': first['postcode'] == second['postcode'],
            'city1': first['city'],
            'city2': second['city'],
            'same_city': first['city'] == second['city'],
        })

    df_comparison = pd.DataFrame(records, columns=comparison_columns)

    print(f"Number of duplicates based on {column_name}: {len(df_comparison)}")
    return df_comparison


In [144]:
# Compare the first two rows of every apartmentID that occurs more than once.
df_comparison = compare_duplicates('apartmentID')

Number of duplicates based on apartmentID: 1358


Step 1.2.1: View Same IDs and address with different titles

In [145]:
# Pairs sharing ID and address whose titles differ.
df_FalseTitle = df_comparison.loc[
    df_comparison['same_id'].eq(True)
    & df_comparison['same_title'].eq(False)
    & df_comparison['same_address'].eq(True)
]

print(f"""
{len(df_FalseTitle)} duplicates based on apartmentID, that have the same address but different titles.

When you view the table, you can see that the user or owner of the add changed the title over time. 

Result: Duplicates based on ID and Address with false Title will be removed from the dataset. """)

# Uncomment to render the full table of same-ID / same-address pairs with differing titles:
#display_as_table(df_FalseTitle)


229 duplicates based on apartmentID, that have the same address but different titles.

When you view the table, you can see that the user or owner of the add changed the title over time. 

Result: Duplicates based on ID and Address with false Title will be removed from the dataset. 


Step 1.2.2: View Same IDs and Title with different address. 

In [146]:
# Pairs sharing ID and title whose addresses differ.
df_FalseAddress = df_comparison.loc[
    df_comparison['same_id'].eq(True)
    & df_comparison['same_title'].eq(True)
    & df_comparison['same_address'].eq(False)
]

print(f"""
{len(df_FalseAddress)} duplicates based on apartmentID, that have the same titles but with different addresses.""")

# Uncomment to render the table:
#display_as_table(df_FalseAddress)


18 duplicates based on apartmentID, that have the same titles but with different addresses.


In [147]:
from Levenshtein import ratio

def compare_streetnames(df_differentstreets):
    """Bucket street-name pairs by similarity and print the bucket sizes.

    Each row's ``street1`` / ``street2`` values are converted to lower-case
    strings and scored with ``Levenshtein.ratio`` (a number between 0 and 1).
    Pairs are counted as more than 80% similar, between 40% and 80% similar
    (both bounds inclusive), or less than 40% similar.

    Args:
        df_differentstreets: Frame with ``street1`` and ``street2`` columns.

    Returns:
        The number of pairs that are less than 40% similar.
    """
    street_pairs = [
        (row['street1'], row['street2'])
        for _, row in df_differentstreets.iterrows()
    ]

    more_than_80_percent_similar = 0
    between_40_and_80_percent_similar = 0
    less_than_40_percent_similar = 0

    for raw_first, raw_second in street_pairs:
        # str() also turns missing values into text before comparing.
        similarity = ratio(str(raw_first).lower(), str(raw_second).lower())
        if similarity < 0.4:
            less_than_40_percent_similar += 1
        elif 0.4 <= similarity <= 0.8:
            between_40_and_80_percent_similar += 1
        elif similarity > 0.8:
            more_than_80_percent_similar += 1

    print(f"""
{more_than_80_percent_similar} duplicates based on apartmentID, where the street names are more than 80% similar. --> These are probably the same street with different spellings.
{between_40_and_80_percent_similar} duplicates based on apartmentID, where the street names are between 40% and 80% similar. --> These are probably the same street with minor differences.
{less_than_40_percent_similar} duplicates based on apartmentID, where the street names are less than 40% similar. --> These are probably different streets.""")

    return less_than_40_percent_similar

In [148]:
# Score the street names of the same-title / different-address pairs; the
# returned value is the number of pairs that are less than 40% similar.
less_than_40_percent_similar = compare_streetnames(df_FalseAddress)


14 duplicates based on apartmentID, where the street names are more than 80% similar. --> These are probably the same street with different spellings.
3 duplicates based on apartmentID, where the street names are between 40% and 80% similar. --> These are probably the same street with minor differences.
1 duplicates based on apartmentID, where the street names are less than 40% similar. --> These are probably different streets.


In [149]:
# Share (in percent) of all compared duplicate pairs whose street names are
# less than 40% similar. It is computed once and reused below; an empty
# comparison table counts as 0 % instead of raising ZeroDivisionError.
total_pairs = len(df_comparison)
pct_dissimilar = (less_than_40_percent_similar / total_pairs) * 100 if total_pairs else 0.0
is_acceptable = 1 > pct_dissimilar

print(f""" 
The percentage of duplicates based on apartment ID, where  {pct_dissimilar:.4f} % of the street names are less than 40% similar.
This percentage means that there may be ads that are identical in terms of apartmentID, but have different addresses and are therefore not duplicates.
Is this percentage acceptable? If it is less than 1%: {is_acceptable}
""")

if is_acceptable:
    print("The percentage is acceptable. The duplicates based on apartmentID with different addresses will be removed from the dataset.")

# Stop the notebook here when the share reaches the 1 % threshold.
assert is_acceptable, "The percentage is not acceptable. The duplicates based on apartmentID with different addresses will not be removed from the dataset."

 
The percentage of duplicates based on apartment ID, where  0.0736 % of the street names are less than 40% similar.
This percentage means that there may be ads that are identical in terms of apartmentID, but have different addresses and are therefore not duplicates.
Is this percentage acceptable? If it is less than 1%: True

The percentage is acceptable. The duplicates based on apartmentID with different addresses will be removed from the dataset.


Step 1.2.3: View same IDs with different title and address

In [150]:
# Pairs sharing only the ID: both title and address differ.
df_AllDifferent = df_comparison.loc[
    df_comparison['same_id'].eq(True)
    & df_comparison['same_title'].eq(False)
    & df_comparison['same_address'].eq(False)
]

# Count how many of these pairs have street names that are less than 40% similar.
less_than_40_percent_similar = compare_streetnames(df_AllDifferent)

# Uncomment to render the table:
#display_as_table(df_AllDifferent)


30 duplicates based on apartmentID, where the street names are more than 80% similar. --> These are probably the same street with different spellings.
8 duplicates based on apartmentID, where the street names are between 40% and 80% similar. --> These are probably the same street with minor differences.
4 duplicates based on apartmentID, where the street names are less than 40% similar. --> These are probably different streets.


In [151]:
# Share (in percent) of all compared duplicate pairs whose street names are
# less than 40% similar. It is computed once and reused below; an empty
# comparison table counts as 0 % instead of raising ZeroDivisionError.
total_pairs = len(df_comparison)
pct_dissimilar = (less_than_40_percent_similar / total_pairs) * 100 if total_pairs else 0.0

print(f""" 
The percentage of duplicates based on apartment ID, where  {pct_dissimilar:.4f} % of the street names are less than 40% similar.
This percentage means that there may be ads that are identical in terms of apartmentID, but have different addresses and are therefore not duplicates.
Is this percentage acceptable? If it is less than 1%: {1 > pct_dissimilar}""")

# Stop the notebook here when the share reaches the 1 % threshold.
assert 1 > pct_dissimilar, "The percentage of duplicates based on apartment ID, where the street names are less than 40% similar is too high. Please check the data."

 
The percentage of duplicates based on apartment ID, where  0.2946 % of the street names are less than 40% similar.
This percentage means that there may be ads that are identical in terms of apartmentID, but have different addresses and are therefore not duplicates.
Is this percentage acceptable? If it is less than 1%: True


In [152]:
# Pairs that agree on ID, title and address.
df_AllSame = df_comparison.loc[
    df_comparison['same_id'].eq(True)
    & df_comparison['same_title'].eq(True)
    & df_comparison['same_address'].eq(True)
]

print(f"""
{len(df_AllSame)} duplicates based on apartmentID, that have the same titles and the same address.
      
These are the exact duplicates and need to be removed.""")

# Uncomment to render the table:
#display_as_table(df_AllSame)


1069 duplicates based on apartmentID, that have the same titles and the same address.
      
These are the exact duplicates and need to be removed.


In [153]:
# The four title/address categories must add up to the whole comparison table.
assert len(df_comparison) == sum(map(len, (df_FalseTitle, df_FalseAddress, df_AllSame, df_AllDifferent)))

##### Step 1.3: Remove duplicate IDs from the WG-Gesucht Dataset

In [154]:
# Keep one row per apartmentID; with pandas' default keep='first' the first
# occurrence in the frame is the one that survives.
df = df.drop_duplicates(subset='apartmentID')


In [155]:
%store df

def data #Delete the df from the memory 

Empty DataFrame
Columns: [apartmentID, title, room_size, total_rent, premiumstatus, user_name, address, street, postcode, suburb, city, available_from, available_until, online_since, rent, utilities, other_costs, deposit, transfer_agreement_cost, apartment_size, max_roommate, roommate_age, languages, wg_type, smoking_policy, preferred_gender_age, wg_detail1, wg_detail2, house_type, floor, parking_situation, public_transport_reach, furnitured, garden, balcony, electricity_eco_friendly, heating, internet, bathroom, ground_material, object_detail1, object_detail2, required_document1, required_document2, required_document3]
Index: []

[0 rows x 45 columns]
Number of duplicates based on address: 873


Unnamed: 0,apartmentID1,apartmentID2,same_id,title1,title2,same_title,address1,address2,same_address,street1,street2,same_street,zip1,zip2,same_zip,city1,city2,same_city
0,10716109,10835354,True,WG-Zimmer (befristet),"Suchen dritten Mitbewohner/in; >15m²; 450Euro Warm(+plus GEZ und WLAN), Ab April, befristet bis Dezember",False,Trützschlerstraße12487 Berlin Johannisthal,Trützschlerstraße12487 Berlin Johannisthal,True,Trützschlerstraße,Trützschlerstraße,True,12487.0,12487.0,True,Berlin,Berlin,True
0,9736095,10548010,True,Zimmer nähe Uni Adlershof,Zimmer nahe Flughafen Schönefeld,False,Bohnsdorfer Kirchsteig12526 Berlin Bohnsdorf,Bohnsdorfer Kirchsteig12526 Berlin Bohnsdorf,True,Bohnsdorfer Kirchsteig,Bohnsdorfer Kirchsteig,True,12526.0,12526.0,True,Berlin,Berlin,True
0,10171103,10760076,True,Sonniges großes Zimmer in 3er WG in Berlin-Schöneweide,"Schönes , neu renoviertes Zimmer 21 qm, in neu zu gründender 2er WG, vollmöbliert, Hochparterre, Nähe FH für Wirtschaft und Technik",False,Spreestraße 1812439 Berlin Niederschöneweide,Spreestraße 1812439 Berlin Niederschöneweide,True,Spreestraße 18,Spreestraße 18,True,12439.0,12439.0,True,Berlin,Berlin,True
0,8629552,10763358,True,Schöne 1-Zimmer Studio im beliebten Crellekiez in Schöneberg - Pauschalmiete,2 Wochen Zwischenmiete // 35qm (Zimmer + Plattform),False,Crellestraße10827 Berlin Schöneberg,Crellestraße10827 Berlin Schöneberg,True,Crellestraße,Crellestraße,True,10827.0,10827.0,True,Berlin,Berlin,True
0,9765911,10718115,True,Cozy Room in WG in Schöneberg from Jan 10 - Feb 11,Wg Zimmer in Schöneberg für 2 Wochen,False,Ebersstraße10827 Berlin Schöneberg,Ebersstraße10827 Berlin Schöneberg,True,Ebersstraße,Ebersstraße,True,10827.0,10827.0,True,Berlin,Berlin,True
0,8992373,9191174,True,"Nollendorfplatz: 3-Zi-Männer-Single-eher Zweck-WG, Berufstätigen-WG. Bitte meine Fragen BEANTWORTEN!! +FOTO!! senden, sonst KEINE Antwort!! \n rather purpose-MEN-WG/working shared apartment. Please ANSWER my questions!! +PHOTO!! send, otherwise NO answer!!","Nollendorfplatz: 3-Zi-Männer-Single-(eher Zweck) WG/ Bitte Text LESEN + Fragen BEANTWORTEN! + FOTO! sonst KEINE Antwort! KEINE Studenten, sorrry! \n Rather purpose-MEN-WG/Please READ text + ANSWER questions! & PHOTO! otherwise NO answer! NO students!",False,Nähe Nollendorfplatz / KADEWE10777 Berlin Schöneberg,Nähe Nollendorfplatz / KADEWE10777 Berlin Schöneberg,True,Nähe Nollendorfplatz / KADEWE,Nähe Nollendorfplatz / KADEWE,True,10777.0,10777.0,True,Berlin,Berlin,True
0,9771311,9858722,True,URBANELITE.COM // No deposit! (no Kaution) // Berlin Schöneberg prime location // All inclusive furnished luxury room close to Berlin Mitte!!,URBANELITE.COM // No deposit! (no Kaution) // Schöneberg prime location // All inclusive furnished luxury room close to the center of Berlin,False,Gutzkowstraße 410827 Berlin Schöneberg,Gutzkowstraße 410827 Berlin Schöneberg,True,Gutzkowstraße 4,Gutzkowstraße 4,True,10827.0,10827.0,True,Berlin,Berlin,True
0,8505277,10685297,True,1 Zimmer mit Balkon in 3er WG,1-Zimmer Wohnung zur Zwischenmiete für den Januar und Februar,False,Oberschöneweide12459 Berlin Oberschöneweide,Oberschöneweide12459 Berlin Oberschöneweide,True,Oberschöneweide,Oberschöneweide,True,12459.0,12459.0,True,Berlin,Berlin,True
0,7794624,10680549,True,!!!Helles Zimmer mit Balkon HTW nähe!!!,Gemütliche WG in Schöneweide,False,Wilhelminenhofstraße12459 Berlin Oberschöneweide,Wilhelminenhofstraße12459 Berlin Oberschöneweide,True,Wilhelminenhofstraße,Wilhelminenhofstraße,True,12459.0,12459.0,True,Berlin,Berlin,True
0,8159636,10767588,True,Schönes und helles Studentenzimmer zu vermieten.,WG Zimmer Schöneweide (Befristet oder unbefristet) (+7m2 Hochebene),False,Griechische Allee 1612459 Berlin Oberschöneweide,Griechische Allee 1612459 Berlin Oberschöneweide,True,Griechische Allee 16,Griechische Allee 16,True,12459.0,12459.0,True,Berlin,Berlin,True
