<a href="https://colab.research.google.com/github/lohith1266/oasis-infobyte-intership-Data-Anallytics/blob/main/task_3_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np

# Step 2: Load Dataset
# Paths and the outlier threshold are named once here so they are easy to adjust.
RAW_CSV_PATH = "/content/AB_NYC_2019.csv"
CLEAN_CSV_PATH = "/content/AB_NYC_2019_cleaned.csv"
MAX_PRICE = 1000  # listings priced at or above this value are dropped as outliers

df = pd.read_csv(RAW_CSV_PATH)

# Step 3: Explore Dataset
print("Shape:", df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
display(df.head())

# -----------------------------
# Data Cleaning Steps
# -----------------------------

# 1. Handle Missing Data
print("\nMissing values before cleaning:\n", df.isnull().sum())
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df['reviews_per_month']: the chained in-place form operates on a
# temporary Series, raises a FutureWarning in pandas 2.x, and no longer
# updates df under pandas 3.0 Copy-on-Write.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)   # Fill NaN with 0
df = df.dropna(subset=['name', 'host_name'])  # Drop rows missing key info

# 2. Remove Duplicates
df = df.drop_duplicates()

# 3. Standardize Data (e.g., lowercase neighborhood names)
for column in ('neighbourhood_group', 'neighbourhood'):
    df[column] = df[column].str.strip().str.lower()

# 4. Handle Outliers (example: remove listings with unrealistic price)
# .copy() makes the filtered frame independent, so the column assignment in
# step 5 never targets a view of the unfiltered data.
# NOTE(review): the filter has no lower bound, so rows with price 0 are kept;
# confirm whether those should also count as unrealistic.
df = df[df['price'] < MAX_PRICE].copy()   # remove extreme outliers above $1000

# 5. Ensure Data Integrity (convert datatypes if needed)
# errors='coerce' turns unparseable or missing dates into NaT instead of raising.
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

# -----------------------------
# Final Dataset
# -----------------------------
print("\nAfter cleaning:")
print("Shape:", df.shape)
print(df.isnull().sum())
display(df.describe())

# Save cleaned dataset
# index=False keeps the pandas row index out of the written file.
df.to_csv(CLEAN_CSV_PATH, index=False)
print("\nCleaned dataset saved!")

Shape: (48895, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review  

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0



Missing values before cleaning:
 id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

After cleaning:
Shape: (48560, 16)
id                                   0
name                                 0
host_id                              0
host_name                            0
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
long

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews_per_month'].fillna(0, inplace=True)   # Fill NaN with 0


Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
count,48560.0,48560.0,48560.0,48560.0,48560.0,48560.0,48560.0,38684,48560.0,48560.0,48560.0
mean,19027950.0,67647060.0,40.728916,-73.952026,140.247529,6.957352,23.37397,2018-10-04 19:44:42.783579648,1.095641,7.172446,112.387768
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,2011-03-28 00:00:00,0.0,1.0,0.0
25%,9479342.0,7831209.0,40.689997,-73.98294,69.0,1.0,1.0,2018-07-10 00:00:00,0.04,1.0,0.0
50%,19691690.0,30844240.0,40.72295,-73.95557,105.0,3.0,5.0,2019-05-19 00:00:00,0.38,1.0,44.0
75%,29151730.0,107434400.0,40.763112,-73.9361,175.0,5.0,24.0,2019-06-23 00:00:00,1.6,2.0,225.0
max,36487240.0,274321300.0,40.91306,-73.71299,999.0,1250.0,629.0,2019-07-08 00:00:00,58.5,327.0,365.0
std,10977470.0,78614440.0,0.054576,0.046157,112.923735,19.775276,44.650274,,1.599941,33.054587,131.367133



Cleaned dataset saved!
