In [None]:
import pandas as pd

In [None]:
# Load the two raw inputs from the local `data/` directory.
# The 311 sample uses its `unique_key` column as the row index; the rent table
# keeps the default integer index.
df_nyc_311 = pd.read_csv(
    'data/nyc_311_2024_2025_sample.csv',
    index_col="unique_key",
)
df_median_rent = pd.read_csv(
    'data/medianAskingRent_All.csv',
)

In [None]:
# (rows, columns) of each raw table: 311 sample first, rent table second.
print(df_nyc_311.shape, df_median_rent.shape, sep="\n")

In [None]:
# Preview the first five rows of the raw rent table.
df_median_rent.head(n=5)

In [None]:
# Preview the first five rows of the raw 311 sample.
df_nyc_311.head(n=5)

In [None]:
# Columns of the 311 data carried forward into the analysis. The order of this
# list is also the column order of the resulting frame.
list_of_relevant_columns = [
    'created_date',
    'closed_date',
    'complaint_type',
    'descriptor',
    'status',
    'resolution_description',
    'resolution_action_updated_date',
    'borough',
    'community_board',
    'incident_zip',
    'incident_address',
    'street_name',
    'city',
    'latitude',
    'longitude',
]

# All rows, only the columns listed above (raises KeyError if one is missing).
df_nyc_311_selected = df_nyc_311.loc[:, list_of_relevant_columns]

In [None]:
# Rent columns whose header starts with "2024" or "2025".
date_columns = [
    col
    for col in df_median_rent.columns
    if col.startswith(('2024', '2025'))
]

# Keep the first three columns of the rent table (presumably the area
# identifiers — TODO confirm against the CSV header) followed by the
# 2024/2025 columns selected above.
df_median_rent_selected = df_median_rent[[*df_median_rent.columns[:3], *date_columns]]

In [None]:
# Preview the first five rows of the column-reduced 311 frame.
df_nyc_311_selected.head(n=5)

In [None]:
# Summary statistics for every selected column; include='all' makes pandas
# describe non-numeric columns as well as numeric ones.
df_nyc_311_selected.describe(include='all')

In [None]:
# Analysis of missing values with percentages.
#
# The NaN counts are computed and sorted once, and the percentages are derived
# from those already-sorted counts. Both Series therefore share exactly the
# same index order. Sorting them independently is fragile: the default sort is
# not stable, so columns with tied counts may come out in a different order in
# each Series, and pandas then aligns the two on a re-sorted union index,
# losing the descending order in the displayed table.
missing_values = df_nyc_311_selected.isna().sum().sort_values(ascending=False)
missing_percentage = missing_values / len(df_nyc_311_selected) * 100

# Creating a DataFrame for convenient display
missing_data = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})

# Only show columns with empty values
missing_data = missing_data[missing_data['Missing_Count'] > 0]

# Report the row count of the frame the percentages are based on, so the total
# and the percentages always agree even if this cell is re-run after rows have
# been removed from df_nyc_311_selected.
print(f"Total number of rows in the dataset: {len(df_nyc_311_selected)}")
print("\nAnalysis of empty values:")
print(missing_data.round(2))

In [None]:
# Number of rows that repeat an earlier row in every column. The row index is
# not part of the comparison, so 311 rows that differ only in their index
# value are counted as duplicates here.
print("Duplicate rows in rent data:", df_median_rent_selected.duplicated().sum())
print("Duplicate rows in 311 data:", df_nyc_311_selected.duplicated().sum())

In [None]:
# Keep only the first occurrence of each fully identical row (compared on the
# column values, not on the index), then show the resulting shape.
df_nyc_311_selected = df_nyc_311_selected[
    ~df_nyc_311_selected.duplicated(keep='first')
]
df_nyc_311_selected.shape

In [None]:
# Parse the three timestamp columns to datetime in one pass. With
# errors='coerce', values that cannot be parsed become NaT instead of raising.
# The shape is displayed afterwards; parsing does not add or remove rows.
df_nyc_311_selected = df_nyc_311_selected.assign(**{
    date_col: pd.to_datetime(df_nyc_311_selected[date_col], errors='coerce')
    for date_col in (
        'created_date',
        'closed_date',
        'resolution_action_updated_date',
    )
})
df_nyc_311_selected.shape

In [None]:
# Number of requests created after they were closed. A comparison involving a
# missing (NaT) date is False, so rows with a missing date are not counted.
int((df_nyc_311_selected['created_date'] > df_nyc_311_selected['closed_date']).sum())

In [None]:
# Keep a request when it has no closing date, or when it was created no later
# than it was closed. A row with a closing date but a missing (NaT) creation
# date fails both conditions and is removed as well.
df_nyc_311_selected = df_nyc_311_selected.loc[
    df_nyc_311_selected['closed_date'].isna()
    | df_nyc_311_selected['created_date'].le(df_nyc_311_selected['closed_date'])
]
df_nyc_311_selected.shape

In [None]:
# Distinct-value count per column. Columns with fewer than 1000 distinct
# values get their full frequency table printed below.
unique_counts = df_nyc_311_selected.nunique()
columns_to_check = [
    name for name, n_unique in unique_counts.items() if n_unique < 1000
]

for col in columns_to_check:
    # One print call per column: header line, frequency table, then the same
    # trailing blank lines the separate print("\n") call would produce.
    print(
        f"Value counts for column '{col}':",
        df_nyc_311_selected[col].value_counts(),
        "\n",
        sep="\n",
    )

In [None]:
# Display the names of the columns whose value counts were printed above
# (those with fewer than 1000 distinct values).
columns_to_check