##### Utils Functions

In [128]:
def get_feature_types_stats(df):
    features = df.dtypes.rename('feature_type').reset_index().rename(columns={'index':'feature'})
    return features


def get_feature_stats(df):
    missing_stats = df.isna().sum().rename('missing_count').reset_index().rename(columns={'index':'feature'})
    dont_know = df[df == 998].notna().sum().rename('dont_know_count').reset_index().rename(columns={'index':'feature'})
    no_response = df[df == -9998].notna().sum().rename('no_response_count').reset_index().rename(columns={'index':'feature'})
    technological_error = df[df == 9999].notna().sum().rename('technological_error_count').reset_index().rename(columns={'index':'feature'})
    prefer_not_to_answer = df[df == 999].notna().sum().rename('prefer_not_to_answer_count').reset_index().rename(columns={'index':'feature'})
    not_required = df[df == 995].notna().sum().rename('not_required_count').reset_index().rename(columns={'index':'feature'})
    
    missing_stats['missing_percentage'] = missing_stats['missing_count']/df.shape[0]
    missing_stats['dont_know_percentage'] = dont_know['dont_know_count']/df.shape[0]
    missing_stats['no_response_percentage'] = no_response['no_response_count']/df.shape[0]
    # missing_stats['technological_error_percentage'] = technological_error['technological_error_count']/df.shape[0]
    missing_stats['not_required_percentage'] = not_required['not_required_count']/df.shape[0]
    # missing_stats['prefer_not_to_answer_percentage'] = not_required['prefer_not_to_answer_count']/df.shape[0]
    
    feature_types_df = get_feature_types_stats(df)
    
    missing_stats = missing_stats.merge(feature_types_df,
                         left_on='feature',
                         right_on='feature',
                         how='left')
    missing_stats.sort_values('missing_percentage')
    return missing_stats 


In [94]:
import pandas as pd
#from src.utils.utils import get_feature_stats

In [95]:
data = pd.read_csv('../data/raw/Citywide_Survey.csv')

In [96]:
data.shape

(8286, 165)

In [97]:
data.columns

Index(['hh_id', 'weight', 'cms_zone', 'survey_mode', 'person_id', 'person_num',
       'is_participant', 'num_days', 'num_days_complete', 'num_trips',
       ...
       'race_native_hawaiian', 'race_white', 'race_other',
       'disability_hearing', 'disability_seeing', 'disability_walking',
       'disability_mobility', 'disability_dressing',
       'disability_concentrating', 'disability_none'],
      dtype='object', length=165)

In [98]:
data.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

In [99]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,hh_id,weight,cms_zone,survey_mode,person_id,person_num,is_participant,num_days,num_days_complete,num_trips,num_walk_trips,num_transit_trips,num_bike_trips,num_taxi_trips,num_tnc_trips,num_complete_weekend_days,num_complete_weekdays,first_travel_date,last_travel_date,made_trips,smartphone_type,relationship,age,employment,worker,student,license,vehicle,congestion,drive_cbd_freq,planning_apps,job_type,jobs_count,industry,telework_freq,work_cms_zone,missing_work_location,work_mode,work_mode_own,work_park_location,work_park_pay,work_vehicle_pass_period,work_park_amount_day,work_park_amount_week,work_park_amount_month,work_park_amount_year,work_mode_auto,work_mode_bus,work_mode_rail,work_mode_ferry,work_mode_taxi,work_mode_bike,work_bike_park,work_mode_scooter,work_scooter_park,school_type,school_cms_zone,missing_school_location,kid_accompany,school_mode,school_mode_own,school_park_location,school_vehicle_pass_period,school_park_amount_day,school_park_amount_month,school_park_amount_week,school_park_amount_year,school_mode_auto,school_mode_bus,school_mode_rail,school_mode_ferry,school_access,school_egress,school_mode_taxi,school_mode_bike,school_bike_park,school_mode_scooter,school_scooter_park,bike_freq,bike_num_days,bike_stolen,bike_purpose_errands,bike_purpose_transit,bike_purpose_recreation,bike_purpose_commute,bike_purpose_appointment,bike_purpose_other,no_bike_nyc_bike_lanes,no_bike_nyc_paving,no_bike_nyc_storage,no_bike_nyc_long_trips,no_bike_nyc_showers,no_bike_nyc_modes,no_bike_nyc_unable,no_bike_nyc_other,bike_share_citi_bike,bike_share_jump,bike_share_lime,bike_share_none,bike_share_user,citi_bike_freq,jump_freq,lime_freq,no_bike_share_unfamiliar,no_bike_share_expensive,no_bike_share_neighborhood,no_bike_share_stations,no_bike_share_personal_bike,no_bike_share_docks,no_bike_share_uncomfortable,no_bike_share_other,tnc_use_uber,tnc_use_lyft,tnc_use_via,tnc_use_juno,tnc_use_none,tnc_user,tnc_freq,tnc_purpose,tnc_mode,tnc_mode_auto,tnc_mode_taxi,tnc_mode_bus,tnc_mode_rail,tnc_mode_ferry,tnc_mode_bike,tnc_mode_scooter,car_share_zipcar,car_share_car2go,car_share_enterprise,car_share_other,car_share_none,car_share_user,packages,packages_distance,harassment,harassment_trip,harassment_time,harassment_route,harassment_transit,harassment_mode,harassment_neighborhood,harassment_private_car,harassment_alone,harassment_attention,harassment_defense,harassment_other,harassment_none,gender,education,english_proficiency,ethnicity,race_american_indian,race_asian,race_black,race_native_hawaiian,race_white,race_other,disability_hearing,disability_seeing,disability_walking,disability_mobility,disability_dressing,disability_concentrating,disability_none
0,191546322,0.0,Inner Brooklyn,1,19154632203,3,0,,,,,,,,,,,06/24/2019,06/30/2019,,995,5,9,3,1,0,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995
1,191546322,0.0,Inner Brooklyn,1,19154632202,2,0,,,,,,,,,,,06/24/2019,06/30/2019,,995,5,8,6,0,0,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995
2,191546322,212.12729,Inner Brooklyn,1,19154632201,1,1,7.0,7.0,23.0,1.0,3.0,11.0,0.0,0.0,2.0,5.0,06/24/2019,06/30/2019,1.0,1,0,5,6,0,1,1,1,4,7,1,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,10,,1,995,100,6,4,5,,,,,995,995,995,995,995,995,995,995,995,995,995,2,2,0,1,0,1,0,0,0,995,995,995,995,995,995,995,995,0,0,0,1,0,995,995,995,0,0,0,0,1,0,0,0,0,0,0,0,1,0,995,995,995,995,995,995,995,995,995,995,0,0,0,0,1,0,4,2,4,995,995,995,995,995,995,995,995,995,995,995,995,1,5,1,997,0,0,0,1,0,0,0,0,0,0,0,0,1
3,19241657,0.0,Middle Queens,1,1924165703,3,0,,,,,,,,,,,05/29/2019,06/04/2019,,995,3,8,1,1,0,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995
4,19241657,0.0,Middle Queens,1,1924165702,2,0,,,,,,,,,,,05/29/2019,06/04/2019,,995,3,7,6,0,0,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,,0,995,995,995,995,995,,,,,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995,995


In [100]:
int_features = data.dtypes[data.dtypes == "int64"]

In [101]:
float_features = data.dtypes[data.dtypes == "float64"]

In [102]:
object_features = data.dtypes[data.dtypes == "O"]

In [129]:
feature_stats = get_feature_stats(data) 
feature_stats
# feature_stats[feature_stats["dont_know_percentage"]>0]

Unnamed: 0,feature,missing_count,missing_percentage,dont_know_percentage,no_response_percentage,not_required_percentage,feature_type
0,hh_id,0,0.0,0.0,0.0,0.000000,int64
1,weight,0,0.0,0.0,0.0,0.000000,float64
2,cms_zone,0,0.0,0.0,0.0,0.000000,object
3,survey_mode,0,0.0,0.0,0.0,0.000000,int64
4,person_id,0,0.0,0.0,0.0,0.000000,int64
...,...,...,...,...,...,...,...
160,disability_walking,0,0.0,0.0,0.0,0.596186,int64
161,disability_mobility,0,0.0,0.0,0.0,0.596186,int64
162,disability_dressing,0,0.0,0.0,0.0,0.596186,int64
163,disability_concentrating,0,0.0,0.0,0.0,0.596186,int64
