**Cleaned Citi Bike Data**

In [29]:
import pandas as pd

In [30]:
# Load the raw trip export(s), parse the timestamps, derive per-trip features
# and write the cleaned frame to CSV.
#
# low_memory=False is passed to read_csv to suppress the DtypeWarning that the
# default chunked type inference raises on this file.
file_paths = ['Resourecs citibike data/2023_data.csv']

# One DataFrame per input file, collected here before concatenation.
dfs = []

# Load each CSV file into a DataFrame and append to the list
for file_path in file_paths:
    df = pd.read_csv(file_path, low_memory=False)
    dfs.append(df)

# Stack all inputs into one frame; ignore_index=True renumbers rows 0..n-1
# instead of keeping each file's own index.
data_combined = pd.concat(dfs, ignore_index=True)

# Parse the timestamp columns. errors='coerce' turns unparseable values into
# NaT instead of raising, so bad rows can be dropped in the next step.
data_combined['started_at'] = pd.to_datetime(data_combined['started_at'], errors='coerce')
data_combined['ended_at'] = pd.to_datetime(data_combined['ended_at'], errors='coerce')

# Drop rows where either timestamp is missing or failed to parse (NaT)
data_combined = data_combined.dropna(subset=['started_at', 'ended_at'])

# Trip duration in minutes (timedelta -> seconds -> minutes)
data_combined['trip_duration'] = (data_combined['ended_at'] - data_combined['started_at']).dt.total_seconds() / 60

# Calendar components of the start time: day of month, month, year, hour
data_combined['start_day'] = data_combined['started_at'].dt.day
data_combined['start_month'] = data_combined['started_at'].dt.month
data_combined['start_year'] = data_combined['started_at'].dt.year
data_combined['start_hour'] = data_combined['started_at'].dt.hour

# Preview the cleaned data. A bare `data_combined.head()` in the middle of a
# cell is evaluated and discarded (only a cell's last expression is rendered),
# so display() is called explicitly. display() is provided by the
# IPython/Jupyter kernel without an import.
display(data_combined.head())

# Save the combined and cleaned DataFrame to a CSV file (index column omitted)
cleaned_file_path = 'Resourecs citibike data/cleaned_citi_bike_data_combined.csv'
data_combined.to_csv(cleaned_file_path, index=False)

# Echo the output path as the cell result
cleaned_file_path

'Resourecs citibike data/cleaned_citi_bike_data_combined.csv'

In [33]:
# Build `cleaned_data` explicitly from the frame produced by the load/clean
# cell (`data_combined`). No visible cell defines `cleaned_data`, so without
# this line the cell raises NameError on a fresh kernel (Restart & Run All).
# .copy() leaves `data_combined` untouched and makes this cell safe to re-run.
# NOTE(review): confirm that no additional cleaning steps lived in cells that
# were removed from the notebook before this one.
cleaned_data = data_combined.copy()

# Trip duration in minutes. pd.to_datetime leaves datetime64 columns as they
# are and parses them if they are still text.
cleaned_data['trip_duration'] = (pd.to_datetime(cleaned_data['ended_at']) - pd.to_datetime(cleaned_data['started_at'])).dt.total_seconds() / 60

# Keep only trips with 0 < duration <= MAX_TRIP_MINUTES; zero or negative
# durations and very long trips are filtered out.
MAX_TRIP_MINUTES = 120
valid_duration = cleaned_data['trip_duration'].gt(0) & cleaned_data['trip_duration'].le(MAX_TRIP_MINUTES)

# .copy() makes the filtered result an independent frame, so later column
# assignments on `cleaned_data` do not trigger SettingWithCopyWarning.
cleaned_data = cleaned_data.loc[valid_duration].copy()


'Resourecs citibike data/cleaned_citi_bike_data_combined.csv'

In [35]:
import pandas as pd
import numpy as np

def haversine(lat1, lon1, lat2, lon2, r=3956):
    """Great-circle distance between two points using the Haversine formula.

    The computation uses only element-wise NumPy operations, so each
    coordinate may be a scalar, a list, a NumPy array or a pandas Series
    (all of compatible shape).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.
    r : radius of the sphere; the result is expressed in the same unit.
        Defaults to 3956 (miles). Use 6371 for kilometers.

    Returns
    -------
    Distance along the surface of the sphere, in the unit of ``r``.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Mathematically 0 <= a <= 1, but floating-point round-off can push it
    # marginally outside that range (e.g. near-antipodal points), which would
    # make sqrt(1 - a) return NaN. Clamp before taking the square roots.
    a = np.clip(a, 0.0, 1.0)

    # Central angle between the two points, in radians
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return c * r

# Great-circle distance of every trip, in miles (haversine's default radius).
# haversine() uses only element-wise NumPy operations, so the four coordinate
# columns are passed whole. This gives the same per-row result as
# DataFrame.apply(..., axis=1) without a Python-level call for every row.
cleaned_data['distance_miles'] = haversine(
    cleaned_data['start_lat'],
    cleaned_data['start_lng'],
    cleaned_data['end_lat'],
    cleaned_data['end_lng'],
)

# Save the cleaned data to CSV (index column omitted). This is the same path
# the load/clean cell wrote to, so that earlier file is overwritten.
cleaned_file_path = 'Resourecs citibike data/cleaned_citi_bike_data_combined.csv'
cleaned_data.to_csv(cleaned_file_path, index=False)

# Echo the output path as the cell result
cleaned_file_path


'Resourecs citibike data/cleaned_citi_bike_data_combined.csv'