In [5]:
import requests
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from fuzzywuzzy import process


# List of raw URLs to the .pickle files
urls = [
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/distance_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/flights_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/flights_df_all_fields.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/geos_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/dcb2607fc164695b682e18bebaf35923e127094f/data/pickled/msa_df.pickle',
    'https://github.com/mamaOcoder/highspeedrail/raw/88761104ba44f569e5fadde614eb5ed6a54900ba/data/pickled/tti_df.pickle'
]

loaded_data = []

for url in urls:
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Load the content of the file
        data = pickle.loads(response.content)
        loaded_data.append(data)
    else:
        print(f"Failed to retrieve {url}")

In [10]:
# Convert loaded data to DataFrames
distance_df = pd.DataFrame(loaded_data[0])
flights_df = pd.DataFrame(loaded_data[1])
flights_df_all_fields = pd.DataFrame(loaded_data[2])
geos_df = pd.DataFrame(loaded_data[3])
msa_df = pd.DataFrame(loaded_data[4])
tti_df = pd.DataFrame(loaded_data[5])


In [11]:
# Inspect the column names of tti_df
print(tti_df.columns)

Index(['AreaGroup', 'MetroArea', 'StateCode', 'PopulationGroup', 'Year',
       'Population_thousands', 'PopulationRank', 'AutoCommuters',
       'FreewayDailyVehicle_miles_thousands',
       'ArterialStreetDailyVehicle_miles_thousands', 'ValueOfTime',
       'CommercialValueOfTime', 'AverageStateGasCost',
       'AverageStateDieselCost', 'CongestedTravel', 'CongestedSystem',
       'NumberOfRushHours', 'TotalGallons_thousands', 'TotalGallonsRank',
       'GallonsPerAutoCommuter', 'GallonsPerAutoCommuterRank', 'TotalDelay',
       'TotalDelayRank', 'DelayPerAutoCommuter', 'DelayPerAutoCommuterRank',
       'TravelTimeIndexValue', 'TravelTimeIndexRank',
       'CommuterStressIndexValue', 'CommuterStressIndexRank',
       'FreewayPlanningTimeIndexValue', 'FreewayPlanningTimeIndexRank',
       'AnnualCongestionCostTotalDollars_millions', 'AnnualCongestionCostRank',
       'AnnualCongestionCostPerAutoCommuter',
       'AnnualCongestionCostPerAutoCommuterRank', 'TruckTotalDelay_thousands',


In [12]:
# The correct column name is 'MetroArea'
city_column_name = 'MetroArea'

# Function to get best match for city names
def get_best_match(city, choices):
    match, score = process.extractOne(city, choices)
    return match if score > 80 else None  # Adjust the threshold as needed

# Create a set of all city names from msa_df and distance_df
all_city_names = set(msa_df['MainCity']).union(set(distance_df['Origin']), set(distance_df['Destination']))

# Apply fuzzy matching to align TTI city names with the city names in other datasets
tti_df['Matched_City'] = tti_df[city_column_name].apply(lambda x: get_best_match(x, all_city_names))

# Convert TTI DataFrame to a dictionary for quick lookup
tti_data = tti_df.set_index('Matched_City')['TravelTimeIndexValue'].to_dict()


In [13]:
# Define scoring functions
def score_population(pop):
    pop = int(pop.replace(',', '')) if isinstance(pop, str) else pop
    return (pop / max(city_population.values())) * 100

def score_distance(dist):
    if dist <= 150:
        return ((dist / 100) + 1) * 10
    elif 150 < dist <= 300:
        return 25
    elif 300 < dist <= 350:
        return (((500 - dist) / 100) + 0.5) * 10
    elif 350 < dist <= 500:
        return ((500 - dist) / 100) * 10
    else:
        return 0

def score_gdp(gdp):
    return (gdp / max(city_gdp.values())) * 100

def score_congestion(ttis, city):
    if city in ttis:
        return (ttis[city] / max(ttis.values())) * 100
    else:
        return 0

In [14]:
# Merge msa_df with geos_df to get population and GDP for each city
msa_merged_df = pd.merge(msa_df, geos_df, left_on='MainCity', right_on='MainCity')

# Create dictionaries for quick lookup of population and GDP
city_population = msa_merged_df.set_index('MainCity')['Population'].to_dict()
city_gdp = msa_merged_df.set_index('MainCity')['GDP_thousands_dollars'].to_dict()

# Calculate scores for each city pair
city_pairs = distance_df[['Origin', 'Destination', 'Distance_miles']].copy()
city_pairs['Population_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_population(city_population.get(x, 0)))
city_pairs['Population_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_population(city_population.get(x, 0)))
city_pairs['Distance_Score'] = city_pairs['Distance_miles'].map(score_distance)
city_pairs['GDP_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
city_pairs['GDP_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_gdp(city_gdp.get(x, 0)))
city_pairs['Congestion_Score_Origin'] = city_pairs['Origin'].map(lambda x: score_congestion(tti_data, x))
city_pairs['Congestion_Score_Destination'] = city_pairs['Destination'].map(lambda x: score_congestion(tti_data, x))

# Calculate total and normalized scores
city_pairs['Total_Score'] = city_pairs[['Population_Score_Origin', 'Population_Score_Destination', 'Distance_Score', 'GDP_Score_Origin', 'GDP_Score_Destination', 'Congestion_Score_Origin', 'Congestion_Score_Destination']].sum(axis=1)
city_pairs['Normalized_Score'] = (city_pairs['Total_Score'] / city_pairs['Total_Score'].max()) * 100


In [15]:
# Rank the city pairs by their normalized scores
top_50_city_pairs = city_pairs.nlargest(50, 'Normalized_Score')

# Display the top 50 city pairs
print(top_50_city_pairs[['Origin', 'Destination', 'Distance_miles', 
                         'Population_Score_Origin', 'Population_Score_Destination', 
                         'Distance_Score', 'GDP_Score_Origin', 
                         'GDP_Score_Destination', 'Congestion_Score_Origin', 'Congestion_Score_Destination', 
                         'Total_Score', 'Normalized_Score']])

                   Origin        Destination  Distance_miles  \
25629         Chicago, IL       New York, NY      789.945845   
93337        New York, NY        Chicago, IL      789.789260   
96326        New York, NY     Washington, DC      228.846590   
141429     Washington, DC       New York, NY      224.769153   
17859          Boston, MA       New York, NY      216.211632   
93132        New York, NY         Boston, MA      215.271497   
60369         Houston, TX       New York, NY     1628.373542   
94237        New York, NY        Houston, TX     1626.629353   
6339          Atlanta, GA       New York, NY      864.373664   
92838        New York, NY        Atlanta, GA      873.701064   
24727         Chicago, IL        Houston, TX     1073.605435   
58577         Houston, TX        Chicago, IL     1082.857649   
95920        New York, NY        Seattle, WA     2851.558511   
125929        Seattle, WA       New York, NY     2836.339892   
10159       Baltimore, MD       New York

In [16]:
# Display the top 50 city pairs
print(top_50_city_pairs[['Origin', 'Destination',  
                         'Total_Score', 'Normalized_Score']])

                   Origin        Destination  Total_Score  Normalized_Score
25629         Chicago, IL       New York, NY   382.305203        100.000000
93337        New York, NY        Chicago, IL   382.305203        100.000000
96326        New York, NY     Washington, DC   380.467872         99.519407
141429     Washington, DC       New York, NY   380.467872         99.519407
17859          Boston, MA       New York, NY   367.025985         96.003398
93132        New York, NY         Boston, MA   367.025985         96.003398
60369         Houston, TX       New York, NY   361.862267         94.652718
94237        New York, NY        Houston, TX   361.862267         94.652718
6339          Atlanta, GA       New York, NY   349.250996         91.353974
92838        New York, NY        Atlanta, GA   349.250996         91.353974
24727         Chicago, IL        Houston, TX   344.167470         90.024271
58577         Houston, TX        Chicago, IL   344.167470         90.024271
95920       

In [17]:
for i, df in enumerate(dataframes, start=1):
    print(f"Dataset {i}:")
    print("First few rows:")
    print(df.head(), "\n")  # Display the first few rows
    print("Data Types:")
    print(df.dtypes, "\n")  # Column data types
    print("Descriptive Statistics:")
    print(df.describe(), "\n")  # Basic statistics
    print("Missing Values:")
    print(df.isnull().sum(), "\n")  # Missing values
    print("-" * 50)


NameError: name 'dataframes' is not defined

In [None]:
for i, df in enumerate(dataframes, start=1):
    print(f"Visualizing Dataset {i}:")
    # Histograms for all numerical features
    df.hist(figsize=(10, 8))
    plt.show()

    # Pairplot for relationships and distributions
    if len(df.columns) <= 10:  # Pairplot can be very slow for large datasets
        sns.pairplot(df)
        plt.show()

    # Boxplots for numerical columns to check for outliers
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in num_cols:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x=df[col])
        plt.title(f'Boxplot of {col}')
        plt.show()

    print("-" * 100)


In [None]:
for i, df in enumerate(dataframes, start=1):
    # Select only numeric columns for correlation
    numeric_df = df.select_dtypes(include=[np.number])
    if not numeric_df.empty:
        plt.figure(figsize=(10, 8))
        sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
        plt.title(f"Correlation Matrix for Dataset {i}")
        plt.show()
    else:
        print(f"Dataset {i} contains no numeric columns suitable for correlation.")
