In [38]:
import requests
import json
import browsercookie
from datetime import datetime, timedelta
import re
import pandas as pd
import csv
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy

In [2]:
class UberDriver:
    """Small client for the Uber driver earnings web endpoints.

    Authentication relies on the session cookies of a browser that is already
    logged into https://drivers.uber.com/earnings/activities.
    """

    DRIVERS_HOST = "drivers.uber.com"
    ACTIVITY_FEED_URL = "https://drivers.uber.com/earnings/api/getWebActivityFeed?localeCode=en"
    # Seconds to wait for Uber before giving up instead of hanging the notebook.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        # Log into the Uber Drivers page (https://drivers.uber.com/earnings/activities) in Firefox first:
        # the cookies are loaded with browsercookie.firefox(), so no manual "cookies.txt" export is needed.
        cj = browsercookie.firefox()
        # The jar holds cookies for every site visited in that browser; only forward the ones a
        # browser would itself send to drivers.uber.com, so unrelated sessions are never leaked.
        cookies_str = "; ".join(
            f"{cookie.name}={cookie.value}"
            for cookie in cj
            if self._cookie_applies(cookie.domain)
        )

        # Get this value from the Postman code snippet for the original request, which is formed in Postman
        # via import the cURL copied from the request URL in the Google Chrome Network Developer Console for
        # https://drivers.uber.com/earnings/activities
        self.headers = {
          'accept': '*/*',
          'accept-language': 'en-US,en;q=0.9',
          'content-type': 'application/json',
          'cookie': cookies_str,
          'origin': 'https://drivers.uber.com',
          'priority': 'u=1, i',
          'referer': 'https://drivers.uber.com/earnings/activities',
          'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
          'sec-ch-ua-mobile': '?0',
          'sec-ch-ua-platform': '"macOS"',
          'sec-fetch-dest': 'empty',
          'sec-fetch-mode': 'cors',
          'sec-fetch-site': 'same-origin',
          'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
          'x-csrf-token': 'x',
          'x-uber-earnings-seed': '939a975a5c8f7c135421d56458dcccb5'
        }

    @staticmethod
    def _cookie_applies(cookie_domain, host=DRIVERS_HOST):
        """Return True when a cookie stored for `cookie_domain` belongs on a request to `host`."""
        domain = (cookie_domain or "").lstrip(".").lower()
        return bool(domain) and (host == domain or host.endswith("." + domain))

    def _fetch_activity_page(self, payload):
        """POST one activity-feed query (a dict) and return the decoded JSON body.

        Raises the HTTP error reported by `raise_for_status()` on a non-2xx
        response, e.g. when the session cookies have expired.
        """
        response = requests.request(
            "POST",
            self.ACTIVITY_FEED_URL,
            headers=self.headers,
            data=json.dumps(payload),
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json()

    def get_rides(self, start_date_iso, end_date_iso):
        """Return every activity between the two ISO dates, following pagination.

        Args:
            start_date_iso: Window start formatted as 'YYYY-MM-DD'.
            end_date_iso: Window end formatted as 'YYYY-MM-DD'.

        Returns:
            A list of activity dicts; empty when the window has no activity.
        """
        rides = []
        pagination_option = {}
        while True:
            page = self._fetch_activity_page({
              "startDateIso": start_date_iso,
              "endDateIso": end_date_iso,
              "paginationOption": pagination_option
            })
            feed = page['data']
            # 'activities' can be null on any page, including the first one.
            rides.extend(feed['activities'] or [])

            pagination = feed.get('pagination') or {}
            cursor = pagination.get('nextCursor')
            # Stop when the feed is exhausted, or when no cursor is supplied (avoids looping forever).
            if not pagination.get('hasMoreData') or not cursor:
                break
            pagination_option = {"cursor": cursor}
        return rides

    def get_ride_detail(self, ride_uuid):
        """Return the raw response text of the trip page for `ride_uuid`."""
        # This is only helpful to get additional fare breakdown from Uber, if we wanted to analyze how much
        # Uber is taking from each fare.
        url = f"https://drivers.uber.com/earnings/trips/{ride_uuid}"
        response = requests.request("GET", url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.text

In [17]:
uber = UberDriver()

# This is the date I started working as an Uber driver; modify for your start date
start_date = datetime.strptime("2023-01-09", "%Y-%m-%d")
end_date = datetime.today()
current_date = start_date

# Walk forward one week at a time. Each window ends on the date the next one starts on, so an
# activity recorded on a boundary date may be returned by both requests (TODO confirm whether the
# API treats endDateIso as inclusive). Tracking uuids keeps every activity exactly once either way.
rides = []
seen_uuids = set()
while current_date <= end_date:
    next_date = current_date + timedelta(days=7)
    start_date_iso = current_date.strftime('%Y-%m-%d')
    end_date_iso = next_date.strftime('%Y-%m-%d')
    print(f"Getting rides for {start_date_iso} - {end_date_iso}...")
    new_rides = uber.get_rides(start_date_iso, end_date_iso)
    print(f"Retrieved {len(new_rides)} rides.")
    for new_ride in new_rides:
        ride_uuid = new_ride.get('uuid')
        if ride_uuid is not None:
            if ride_uuid in seen_uuids:
                continue
            seen_uuids.add(ride_uuid)
        rides.append(new_ride)
    current_date = next_date

# Let's dump all of these rides to a JSON file so we can reference this data outside of the script if need be, 
# or simply not have to retrieve from Uber again.
with open("rides.json", "w") as file:
    json.dump(rides, file)

print(f"Retrieved {len(rides)} rides total.")

Getting rides for 2023-01-09 - 2023-01-16...
Retrieved 42 rides.
Getting rides for 2023-01-16 - 2023-01-23...
Retrieved 9 rides.
Getting rides for 2023-01-23 - 2023-01-30...
Retrieved 13 rides.
Getting rides for 2023-01-30 - 2023-02-06...
Retrieved 0 rides.
Getting rides for 2023-02-06 - 2023-02-13...
Retrieved 26 rides.
Getting rides for 2023-02-13 - 2023-02-20...
Retrieved 26 rides.
Getting rides for 2023-02-20 - 2023-02-27...
Retrieved 7 rides.
Getting rides for 2023-02-27 - 2023-03-06...
Retrieved 29 rides.
Getting rides for 2023-03-06 - 2023-03-13...
Retrieved 9 rides.
Getting rides for 2023-03-13 - 2023-03-20...
Retrieved 51 rides.
Getting rides for 2023-03-20 - 2023-03-27...
Retrieved 59 rides.
Getting rides for 2023-03-27 - 2023-04-03...
Retrieved 26 rides.
Getting rides for 2023-04-03 - 2023-04-10...
Retrieved 0 rides.
Getting rides for 2023-04-10 - 2023-04-17...
Retrieved 0 rides.
Getting rides for 2023-04-17 - 2023-04-24...
Retrieved 0 rides.
Getting rides for 2023-04-24 - 2

In [15]:
# Starting point for re-runs: read the cached download from disk so the
# retrieval cell above does not have to be executed again.
with open("rides.json", "r") as rides_file:
    rides = json.load(rides_file)

# Show the first record as a sample of the raw structure.
rides[0]

{'uuid': 'd3096d6c-02bd-4f8e-855b-117588b27910',
 'recognizedAt': 1673809557,
 'activityTitle': 'Comfort',
 'formattedTotal': '$10.72',
 'routing': {'webviewUrl': 'https://drivers.uber.com/earnings/trips/d3096d6c-02bd-4f8e-855b-117588b27910',
  'deeplinkUrl': None},
 'breakdownDetails': {'formattedTip': '$1.00', 'formattedSurge': None},
 'tripMetaData': {'formattedDuration': '15 min 56 sec',
  'formattedDistance': '3.9 mi',
  'pickupAddress': 'N Ashland Ave, Chicago, IL 60614-1101, US',
  'dropOffAddress': 'W Madison St, Chicago, 60612, US',
  'mapUrl': 'https://static-maps.uber.com/map?width=360&height=100&marker=lat%3A41.92946%24lng%3A-87.66817%24icon%3Ahttps%3A%2F%2Fd1a3f4spazzrp4.cloudfront.net%2Fmaps%2Fhelix%2Fcar-pickup-pin.png%24anchorX%3A0.5%24anchorY%3A0.5&marker=lat%3A41.88207%24lng%3A-87.67135%24icon%3Ahttps%3A%2F%2Fd1a3f4spazzrp4.cloudfront.net%2Fmaps%2Fhelix%2Fcar-dropoff-pin.png%24anchorX%3A0.5%24anchorY%3A0.5&polyline=color%3A0xFF2DBAE4%24width%3A4%24enc%3Aaj%7C%7EF%60ua

In [17]:
def parse_time_to_seconds(time_str):
    """Convert a duration such as '15 min 56 sec' into a total number of seconds.

    Every '<number> hr|min|sec' pair found in the text contributes to the
    total; text without any such pair yields 0.
    """
    seconds_per_unit = {'hr': 3600, 'min': 60, 'sec': 1}
    total_seconds = 0
    for amount, unit in re.findall(r'(\d+)\s*(hr|min|sec)', time_str):
        total_seconds += int(amount) * seconds_per_unit[unit]
    return total_seconds

def parse_miles(miles_str):
    """Convert a distance such as '3.9 mi' into a float number of miles.

    Accepts thousands separators ('1,204.3 mi'), a bare leading decimal point
    ('.5 mi') and the spelled-out unit ('2 miles').

    Raises:
        ValueError: if no '<number> mi' distance can be found in the text.
    """
    # The trailing \b stops 'mi' from matching the start of 'min' in a duration string.
    match = re.search(r'(\d[\d,]*\.?\d*|\.\d+)\s*mi(?:les?)?\b', miles_str or '')
    if match is None:
        raise ValueError(f"Could not parse a distance in miles from {miles_str!r}")
    return float(match.group(1).replace(',', ''))

def parse_currency_to_float(currency_str):
    """Convert a dollar amount such as '$10.72' or '$1,234.56' into a float.

    The dollar sign, thousands separators and surrounding whitespace are
    removed before conversion; a leading minus sign is preserved.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    clean_str = currency_str.replace('$', '').replace(',', '').strip()
    return float(clean_str)

def parse_season(date):
    """Map a datetime onto 'Spring', 'Summer', 'Fall' or 'Winter'.

    Seasons change on the 21st: Spring runs Mar 21 - Jun 20, Summer
    Jun 21 - Sep 20, Fall Sep 21 - Dec 20, and Winter covers the rest of the
    year (Dec 21 - Mar 20).
    """
    # (month, day) tuples compare in calendar order within a single year.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return 'Spring'
    if (6, 21) <= month_day <= (9, 20):
        return 'Summer'
    if (9, 21) <= month_day <= (12, 20):
        return 'Fall'
    # Everything left wraps around the new year.
    return 'Winter'

def extract_zipcode(address):
    """Return the 5-digit ZIP code found in an address, or None if there is none.

    When the address contains several 5-digit numbers (for example a 5-digit
    street number), the last one is returned, since the ZIP code follows the
    street and city. A ZIP+4 such as '60614-1101' yields '60614'. A missing
    or empty address yields None.
    """
    if not address:
        return None
    zip_codes = re.findall(r'\b(\d{5})(?:-\d{4})?\b', address)
    return zip_codes[-1] if zip_codes else None

# Flatten every raw activity into a simple record with parsed numeric fields.
cleaned_rides = []
for ride in rides:
    # Tip and surge are optional: a missing breakdown or a null amount counts as $0.00.
    breakdown = ride.get('breakdownDetails')
    if not breakdown:
        tip = surge = '$0.00'
    else:
        tip = breakdown['formattedTip'] or '$0.00'
        surge = breakdown['formattedSurge'] or '$0.00'

    # Activities without trip metadata keep None for all trip-specific fields.
    trip_meta = ride.get('tripMetaData')
    if not trip_meta:
        duration = distance = pickup_address = dropoff_address = None
    else:
        duration = parse_time_to_seconds(trip_meta['formattedDuration'])
        distance = parse_miles(trip_meta['formattedDistance'])
        pickup_address = trip_meta['pickupAddress']
        dropoff_address = trip_meta['dropOffAddress']

    # NOTE(review): fromtimestamp() uses this machine's local timezone, so the
    # date/time columns depend on where the notebook is executed.
    when = datetime.fromtimestamp(ride['recognizedAt'])
    weekday_index = when.weekday()
    weekday_name = when.strftime('%A')

    cleaned_rides.append({
        'uuid': ride['uuid'],
        'date': when.strftime('%Y-%m-%d'),
        'time': when.strftime('%H:%M:%S'),
        'day': weekday_name,
        'day of week': weekday_index,
        'sortable day of week': f"{weekday_index} - {weekday_name}",
        'season': parse_season(when),
        'type': ride['activityTitle'],
        'earnings': parse_currency_to_float(ride['formattedTotal']),
        'tip': parse_currency_to_float(tip),
        'surge': parse_currency_to_float(surge),
        'duration': duration,
        'distance': distance,
        'pickup address': pickup_address,
        'dropoff address': dropoff_address,
        'status': ride['status'],
        'note': ride['type'],
    })

# Keep only completed trips that carried passengers; other activity types are dropped.
passenger_ride_types = (
    'Comfort', 'UberX', 'UberXL', 'UberX Share',
    'UberX Priority', 'Uber Pet', 'Business Comfort',
)
filtered_rides = [
    ride
    for ride in cleaned_rides
    if ride['status'] == 'COMPLETED'
    and ride['note'] == 'TRIP'
    and ride['type'] in passenger_ride_types
]

# Let's add some calculated columns now to skip the manual processing in a spreadsheet
enriched_rides = []
for ride in filtered_rides:
    ride = ride.copy()
    trip_earnings = ride['earnings']
    trip_seconds = ride['duration']
    trip_miles = ride['distance']
    ride['earnings-surge'] = trip_earnings - ride['surge']
    # A trip with no recorded (or zero) duration/distance has no meaningful rate: store None
    # rather than failing the whole run with a ZeroDivisionError/TypeError.
    ride['earnings/second'] = trip_earnings / trip_seconds if trip_seconds else None
    ride['earnings/mile'] = trip_earnings / trip_miles if trip_miles else None
    # Addresses are None when the activity had no trip metadata.
    ride['pickup zipcode'] = extract_zipcode(ride['pickup address']) if ride['pickup address'] else None
    ride['dropoff zipcode'] = extract_zipcode(ride['dropoff address']) if ride['dropoff address'] else None
    # Every remaining ride is a completed trip, so these two columns carry no information.
    del ride['status']
    del ride['note']
    enriched_rides.append(ride)

In [18]:
# Load the enriched records into a DataFrame and summarise the numeric columns first.
rides_df = pd.DataFrame(enriched_rides)
print(rides_df.describe())

# Summary over every column (numeric and text) plus the number of missing values in each.
stats = rides_df.describe(include='all')
null_count = rides_df.isna().sum()

print(stats)
print("\nCount of None values per column:\n", null_count)

# For object-typed columns (strings and mixed types), list the distinct non-missing values.
print("\nAdditional Info:")
for column, series in rides_df.items():
    if series.dtype != 'object':
        continue
    unique_strings = series.dropna().unique()
    print(f"Unique values in column '{column}' (#{len(unique_strings)}): {unique_strings}")

       day of week     earnings          tip        surge     duration  \
count  2839.000000  2839.000000  2839.000000  2839.000000  2839.000000   
mean      3.178584    11.444533     1.491092     0.724283  1041.088411   
std       1.748498     7.081759     2.568790     1.574874   683.613600   
min       0.000000     2.860000     0.000000     0.000000    87.000000   
25%       2.000000     6.700000     0.000000     0.000000   586.500000   
50%       3.000000     9.900000     0.000000     0.000000   869.000000   
75%       5.000000    14.000000     3.000000     1.000000  1304.500000   
max       6.000000   105.860000    28.780000    14.000000  7620.000000   

          distance  earnings-surge  earnings/second  earnings/mile  
count  2839.000000      2839.00000      2839.000000    2839.000000  
mean      4.670236        10.72025         0.012210       3.300924  
std       4.289818         6.58630         0.005468       2.052076  
min       0.200000         2.80000         0.004597      

In [19]:
# Export both data sets: the enriched passenger trips and every cleaned activity.
for csv_path, rows in (("Uber Rides.csv", enriched_rides),
                       ("Uber All Ride Data.csv", cleaned_rides)):
    if not rows:
        # The header comes from the first record, so there is nothing to write for an empty list.
        print(f"No rows available for '{csv_path}'; skipping.")
        continue
    # newline='' lets the csv module control line endings (otherwise Windows gets blank rows);
    # an explicit encoding keeps non-ASCII address text from failing on non-UTF-8 locales.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        dw = csv.DictWriter(file, fieldnames=list(rows[0].keys()))
        dw.writeheader()
        dw.writerows(rows)

In [52]:
# Create a defaultdict to store the edge weights (frequencies)
edge_weights = defaultdict(int)

for ride in enriched_rides:
    pickup = ride['pickup zipcode']
    dropoff = ride['dropoff zipcode']
    if pickup is not None and dropoff is not None:
        edge_weights[(pickup, dropoff)] += 1

# Create a directed graph
G = nx.DiGraph()

# Add edges with weights
for (pickup, dropoff), weight in edge_weights.items():
    G.add_edge(pickup, dropoff, weight=weight)

# Find zip codes that are never dropoffs (in-degree zero)
never_dropoff_zipcodes = sorted([node for node, in_degree in G.in_degree() if in_degree == 0])

# Find zip codes that are never pickups (out-degree zero)
never_pickup_zipcodes = sorted([node for node, out_degree in G.out_degree() if out_degree == 0])

# Calculate the in-degrees of all nodes
in_degrees = dict(G.in_degree())
out_degrees = dict(G.out_degree())

# Find the maximum in-degree
max_in_degree = max(in_degrees.values())
max_out_degree = max(out_degrees.values())

# Find zip codes with maximum in-degree (maximum dropoffs)
max_dropoff_zipcodes = [node for node, degree in in_degrees.items() if degree == max_in_degree]

# Find zip codes with maximum out-degree (maximum pickups)
max_pickup_zipcodes = [node for node, degree in out_degrees.items() if degree == max_out_degree]

print("Zip codes I picked up from and never dropped off to:", never_dropoff_zipcodes)
print("Zip codes I dropped off to and never picked up from:", never_pickup_zipcodes)
print("Zip codes that I've picked up from the most:", max_pickup_zipcodes)
print("Zip codes that I've dropped off to the most:", max_dropoff_zipcodes)
print("Total number of zip codes picked up from or dropped off to:", len(G.nodes()))

Zip codes I picked up from and never dropped off to: ['46394', '60005', '60016', '60040', '60101', '60104', '60106', '60164', '60456', '60515', '60534', '60561']
Zip codes I dropped off to and never picked up from: ['06061', '60004', '60007', '60008', '60026', '60043', '60044', '60056', '60085', '60148', '60162', '60171', '60181', '60192', '60301', '60453', '60457', '60459', '60477', '60521', '60525', '60558', '60617', '60633', '60643', '60827']
Zip codes that I've picked up from the most: ['60647']
Zip codes that I've dropped off to the most: ['60611']
Total number of zip codes picked up from or dropped off to: 130
