In [274]:
import pandas as pd



In [275]:
# Load the raw trip records for each of the three cities
nyc_data, chicago_data, washington_data = map(
    pd.read_csv, ('NYC.csv', 'Chicago.csv', 'Washington.csv')
)

In [276]:
# Give every city frame the same core column names (start_time, end_time,
# duration, user_type, birth_year) plus a 'city' label so the frames can be
# concatenated later. Each frame is rebound to a new object rather than
# mutated with inplace=True.

# Standardize columns for New York City
nyc_data = nyc_data.assign(city='New York City').rename(columns={
    'starttime': 'start_time',
    'stoptime': 'end_time',
    'tripduration': 'duration',
    'usertype': 'user_type',
    'birth year': 'birth_year'
})

# Standardize columns for Chicago
chicago_data = chicago_data.assign(city='Chicago').rename(columns={
    'starttime': 'start_time',
    'stoptime': 'end_time',
    'tripduration': 'duration',
    'usertype': 'user_type',
    'birthyear': 'birth_year'
})

# Standardize columns for Washington
# Washington reports trip length in milliseconds ('Duration (ms)'), whereas the
# NYC 'tripduration' values are seconds (presumably Chicago's as well -- TODO
# confirm). Convert to whole seconds before renaming so the shared 'duration'
# column uses a single unit. The membership check keeps the conversion from
# being applied a second time if this cell is re-run after the rename.
if 'Duration (ms)' in washington_data.columns:
    washington_data = washington_data.assign(
        **{'Duration (ms)': washington_data['Duration (ms)'] // 1000}
    )
washington_data = washington_data.assign(city='Washington').rename(columns={
    'Start date': 'start_time',
    'End date': 'end_time',
    'Duration (ms)': 'duration',
    'Member Type': 'user_type'
})


In [277]:

# Washington has no gender or birth_year data; create both as empty columns
for absent_column in ('gender', 'birth_year'):
    washington_data[absent_column] = None


In [278]:
# Count the missing values in every Washington column
washington_data.isna().sum()

duration                    0
start_time                  0
end_time                    0
Start station number        0
Start station               0
End station number          0
End station                 0
Bike number                 0
user_type                   0
city                        0
gender                  66326
birth_year              66326
dtype: int64

In [279]:
# Show the dtype pandas assigned to each Washington column
washington_data.dtypes

duration                 int64
start_time              object
end_time                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
user_type               object
city                    object
gender                  object
birth_year              object
dtype: object

In [280]:
# Show the (rows, columns) size of the Washington frame
washington_data.shape

(66326, 12)

In [281]:
# def confertDate(df):
#     # Convert 'start_time' and 'end_time' to datetime format
#     df['start_time']=pd.to_datetime(df['start_time'])
#     print(df['start_time'].dtypes,'=======\n dataset shape: \n=======',df.shape,'=========\n',df.isnull().sum(),'=======\n\n\n')
#     return df
# washington_data=washington_data.copy()
# washington_data= confertDate(washington_data)
# nyc_data=nyc_data.copy()
# nyc_data= confertDate(nyc_data)
# chicago_data=chicago_data.copy()
# chicago_data= confertDate(chicago_data)

# washington_data['month'] = washington_data['start_time'].dt.month
# washington_data['hour'] = washington_data['start_time'].dt.hour
# washington_data['day'] = washington_data['start_time'].dt.day_name()

# nyc_data['month'] = nyc_data['start_time'].dt.month
# nyc_data['hour'] = nyc_data['start_time'].dt.hour
# nyc_data['day'] = nyc_data['start_time'].dt.day_name()

# chicago_data['month'] = chicago_data['start_time'].dt.month
# chicago_data['hour'] = chicago_data['start_time'].dt.hour
# chicago_data['day'] = chicago_data['start_time'].dt.day_name()


import pandas as pd

def convert_and_extract_datetime_features(df, datetime_column, verbose=True):
    """
    Converts a specified column to datetime format and extracts useful features.

    The input DataFrame is left untouched; all changes are made on a copy,
    which is returned.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        datetime_column (str): The name of the column to convert to datetime.
        verbose (bool): When True (the default), print the converted column's
            dtype, the frame's shape and the per-column null counts.

    Returns:
        pd.DataFrame: A copy of ``df`` whose ``datetime_column`` is parsed to
        datetime, with added ``month``, ``hour`` and ``day`` (weekday name)
        columns derived from it.

    Raises:
        KeyError: If ``datetime_column`` is not a column of ``df``.
    """
    if datetime_column not in df.columns:
        raise KeyError(f"Column {datetime_column!r} not found in DataFrame")

    # Work on a copy so the caller's frame is never modified as a side effect
    result = df.copy()

    # Unparseable values become NaT instead of raising (errors='coerce')
    result[datetime_column] = pd.to_datetime(result[datetime_column], errors='coerce')

    # Print column type, dataset shape, and null values summary.
    # This runs before the feature columns are added, so the reported shape
    # and null counts describe the frame as it was received.
    if verbose:
        print(
            f"{datetime_column} dtype: {result[datetime_column].dtypes}\n"
            f"Dataset shape: {result.shape}\n"
            f"Null values per column:\n{result.isnull().sum()}\n{'='*30}"
        )

    # Extract additional datetime features
    timestamps = result[datetime_column].dt
    result['month'] = timestamps.month
    result['hour'] = timestamps.hour
    result['day'] = timestamps.day_name()

    return result

# Process each dataset and apply datetime conversion.
# Every frame is copied first, then its 'start_time' column is parsed and the
# month/hour/day features are added. The names are rebound with an ordinary
# tuple assignment instead of writing into locals(): that trick only works at
# module/cell scope and silently does nothing inside a function.
# Processing order (Washington, NYC, Chicago) is unchanged.
washington_data, nyc_data, chicago_data = (
    convert_and_extract_datetime_features(city_frame.copy(), "start_time")
    for city_frame in (washington_data, nyc_data, chicago_data)
)


start_time dtype: datetime64[ns]
Dataset shape: (66326, 12)
Null values per column:
duration                    0
start_time                  0
end_time                    0
Start station number        0
Start station               0
End station number          0
End station                 0
Bike number                 0
user_type                   0
city                        0
gender                  66326
birth_year              66326
dtype: int64
start_time dtype: datetime64[ns]
Dataset shape: (276798, 16)
Null values per column:
duration                       0
start_time                     0
end_time                       0
start station id               0
start station name             0
start station latitude         0
start station longitude        0
end station id                 0
end station name               0
end station latitude           0
end station longitude          0
bikeid                         0
user_type                    717
birth_year                 31

In [282]:
# Remove columns that hold no data at all from each city frame
nyc_data, chicago_data, washington_data = (
    city_frame.dropna(axis='columns', how='all')
    for city_frame in (nyc_data, chicago_data, washington_data)
)

# Stack the three city frames into one DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(
    [nyc_data, chicago_data, washington_data],
    ignore_index=True,
)

# Preview the first five rows of the combined data
print(combined_data.head(5))


   duration          start_time           end_time  start station id  \
0       839 2016-01-01 00:09:55  1/1/2016 00:23:54             532.0   
1       686 2016-01-01 00:21:17  1/1/2016 00:32:44            3143.0   
2       315 2016-01-01 00:33:11  1/1/2016 00:38:26            3164.0   
3       739 2016-01-01 00:40:51  1/1/2016 00:53:11             223.0   
4      1253 2016-01-01 00:44:16  1/1/2016 01:05:09             484.0   

       start station name  start station latitude  start station longitude  \
0         S 5 Pl & S 4 St               40.710451               -73.960876   
1         5 Ave & E 78 St               40.776829               -73.963888   
2  Columbus Ave & W 72 St               40.777057               -73.978985   
3         W 13 St & 7 Ave               40.737815               -73.999947   
4         W 44 St & 5 Ave               40.755003               -73.980144   

   end station id          end station name  end station latitude  ...  \
0           401.0   Alle

In [283]:
# List every column present in the combined frame
combined_data.columns

Index(['duration', 'start_time', 'end_time', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'user_type',
       'birth_year', 'gender', 'city', 'month', 'hour', 'day', 'trip_id',
       'from_station_id', 'from_station_name', 'to_station_id',
       'to_station_name', 'Start station number', 'Start station',
       'End station number', 'End station', 'Bike number'],
      dtype='object')

In [284]:
# Save the combined data to a new CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the written file.
combined_data.to_csv('combined_bike_data.csv', index=False)
