In [2]:
# import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [4]:
# Create path 
path = r'/Users/milenafagandini/Desktop/Airbnb /03 Scripts '

In [8]:
df = pd.read_csv(os.path.join(path, 'cleaned2_airbnb_data.csv'), index_col = False)

In [18]:

# Remove columns with 'Unnamed' in the header
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Display initial info
print("Initial data shape (after dropping unnamed columns):", df.shape)
print(df.head())

# --- Data Cleaning ---

# Remove duplicate rows
df_clean = df.drop_duplicates()
print("Data shape after dropping duplicates:", df_clean.shape)

# Fill missing values in the 'price' column with its mean
if 'price' in df_clean.columns:
    mean_price = df_clean['price'].mean()
    df_clean['price'].fillna(mean_price, inplace=True)
    print("Filled missing 'price' values with mean:", mean_price)
else:
    print("'price' column not found in the dataset.")

# --- Outlier Removal on 'price' column only using IQR method ---

if 'price' in df_clean.columns:
    Q1 = df_clean['price'].quantile(0.25)
    Q3 = df_clean['price'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    initial_shape = df_clean.shape[0]
    df_clean = df_clean[(df_clean['price'] >= lower_bound) & (df_clean['price'] <= upper_bound)]
    print(f"After removing outliers in 'price': {initial_shape} -> {df_clean.shape[0]} rows")
else:
    print("Skipping outlier removal because 'price' column is not present.")

# Final shape of cleaned data
print("Final data shape after cleaning and outlier removal:", df_clean.shape)

# Save the cleaned DataFrame to a new CSV file
df_clean.to_csv('cleaned_airbnb_data_t.csv', index=False)

Initial data shape (after dropping unnamed columns): (35172, 14)
      id                                              name   host_id  \
0  11508                Amazing Luxurious Apt-Palermo Soho     42762   
1  14222      RELAX IN HAPPY HOUSE - PALERMO, BUENOS AIRES  87710233   
2  15074                             ROOM WITH RIVER SIGHT     59338   
3  16695                         DUPLEX LOFT 2 - SAN TELMO     64880   
4  20062  PENTHOUSE /Terrace & pool /City views /2bedrooms     75891   

       host_name neighbourhood   latitude  longitude        room_type  \
0        Candela       palermo -34.581840 -58.424150  Entire home/apt   
1         Mar√≠a       palermo -34.586170 -58.410360  Entire home/apt   
2         Monica        nu√±ez -34.538920 -58.465990     Private room   
3  Elbio Mariano     monserrat -34.614390 -58.376110  Entire home/apt   
4         Sergio       palermo -34.581848 -58.441605  Entire home/apt   

      price  minimum_nights  number_of_reviews  \
0   67518.0  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['price'].fillna(mean_price, inplace=True)


In [20]:
import pandas as pd
import csv

# File path
file_path = 'cleaned_airbnb_data_t.csv'

# --- 1. Basic Loading with Pandas ---
try:
    df = pd.read_csv(file_path)
    print("Pandas successfully loaded the file.")
    print("Data shape:", df.shape)
    print("Column names:", df.columns.tolist())
    print("\nSample Data:")
    print(df.head())
except Exception as e:
    print("Error loading CSV with pandas:", e)

# --- 2. Check for Unnamed Columns ---
unnamed_cols = [col for col in df.columns if col.startswith('Unnamed')]
if unnamed_cols:
    print("\nFound unnamed columns:", unnamed_cols)
else:
    print("\nNo unnamed columns found.")

# --- 3. Check for Row Consistency with csv Module ---
try:
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        row_lengths = [len(row) for row in reader]
    
    unique_lengths = set(row_lengths)
    print("\nUnique number of columns per row:", unique_lengths)
    if len(unique_lengths) > 1:
        print("Warning: Not all rows have the same number of columns. This could be causing import issues.")
    else:
        print("All rows appear to have the same number of columns.")
except Exception as e:
    print("Error reading CSV with csv module:", e)

# --- 4. Optional: Inspect first few rows manually from raw CSV ---
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print("\nFirst 5 lines from the raw CSV file:")
    for line in lines[:5]:
        print(line.strip())
except Exception as e:
    print("Error reading file lines:", e)


Pandas successfully loaded the file.
Data shape: (31913, 14)
Column names: ['id', 'name', 'host_id', 'host_name', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm']

Sample Data:
      id                                          name   host_id  \
0  11508            Amazing Luxurious Apt-Palermo Soho     42762   
1  14222  RELAX IN HAPPY HOUSE - PALERMO, BUENOS AIRES  87710233   
2  15074                         ROOM WITH RIVER SIGHT     59338   
3  16695                     DUPLEX LOFT 2 - SAN TELMO     64880   
4  20429             Hermoso  y acogedor departamento.     77065   

       host_name neighbourhood  latitude  longitude        room_type    price  \
0        Candela       palermo -34.58184  -58.42415  Entire home/apt  67518.0   
1         Mar√≠a       palermo -34.58617  -58.41036  Entire home/apt  22375.0   
2         Monica        nu√±ez -34.538

In [22]:
# Load your CSV file
df = pd.read_csv('cleaned_airbnb_data_t.csv')

# Save the file with UTF-8 BOM using 'utf-8-sig' encoding
df.to_csv('cleaned_airbnb_data_utf8bom.csv', index=False, encoding='utf-8-sig')