In [3]:
import pandas as pd

# Read the flight records from the CSV file into a DataFrame
flights_df = pd.read_csv('data/flights.csv')

# Preview the leading rows to see which columns are available
print(flights_df.head())

# Origin and destination of every flight, in file order
start_cities = flights_df['from'].values
end_cities = flights_df['to'].values

# Every city that appears as an origin or a destination, in sorted order
cities = sorted(set(start_cities) | set(end_cities))

# Number of distinct cities (the size of the state space used below)
n_cities = len(cities)

   travelCode  userCode                from                  to  flightType  \
0           0         0         Recife (PE)  Florianopolis (SC)  firstClass   
1           0         0  Florianopolis (SC)         Recife (PE)  firstClass   
2           1         0       Brasilia (DF)  Florianopolis (SC)  firstClass   
3           1         0  Florianopolis (SC)       Brasilia (DF)  firstClass   
4           2         0        Aracaju (SE)       Salvador (BH)  firstClass   

     price  time  distance       agency        date  
0  1434.38  1.76    676.53  FlyingDrops  09/26/2019  
1  1292.29  1.76    676.53  FlyingDrops  09/30/2019  
2  1487.52  1.66    637.56      CloudFy  10/03/2019  
3  1127.36  1.66    637.56      CloudFy  10/04/2019  
4  1684.05  2.16    830.86      CloudFy  10/10/2019  


In [4]:
import numpy as np

# Map each city name to its row/column position in the transition matrix
city_index = {city: idx for idx, city in enumerate(cities)}

# Start from an all-zero n_cities x n_cities matrix of flight counts
transition_matrix = np.zeros((n_cities, n_cities))

# Count observed flights: entry [i, j] is the number of flights from city i to city j
for start, end in zip(start_cities, end_cities):
    start_idx = city_index[start]
    end_idx = city_index[end]
    transition_matrix[start_idx, end_idx] += 1

# Normalize each row so it sums to 1 (row i = distribution of destinations from city i).
# A city that only ever appears as a destination has a zero row total; dividing by it
# would produce NaNs (and a RuntimeWarning) that poison every later computation.
# Such rows are left as all zeros instead.
row_totals = transition_matrix.sum(axis=1, keepdims=True)
transition_matrix = np.divide(
    transition_matrix,
    row_totals,
    out=np.zeros_like(transition_matrix),
    where=row_totals > 0,
)

# Display the transition matrix
print("Transition Matrix:\n", transition_matrix)

Transition Matrix:
 [[0.         0.12983559 0.14487965 0.23218891 0.1057651  0.13120567
  0.07570385 0.07839029 0.10203095]
 [0.15702265 0.         0.14675591 0.25273726 0.09704669 0.12417557
  0.06478443 0.06527178 0.09220572]
 [0.15520318 0.12999309 0.         0.23751007 0.1019627  0.12979164
  0.0695004  0.07252216 0.10351675]
 [0.15079296 0.1357189  0.14398869 0.         0.11705079 0.13275294
  0.10131375 0.10119162 0.11719036]
 [0.16544797 0.1255253  0.14889057 0.28193814 0.         0.12161708
  0.03992268 0.0389141  0.07774416]
 [0.16023622 0.1253937  0.14796588 0.24963911 0.09494751 0.
  0.06223753 0.06404199 0.09553806]
 [0.16758846 0.1185846  0.14362177 0.34534642 0.05649718 0.11281594
  0.         0.         0.05554564]
 [0.17060337 0.1174579  0.14733396 0.33910196 0.05413938 0.11412535
  0.         0.         0.05723807]
 [0.1607619  0.12012698 0.15225397 0.28431746 0.07830688 0.12325926
  0.03953439 0.04143915 0.        ]]


In [5]:
# The stationary distribution pi satisfies pi @ P = pi, i.e. it is a left eigenvector
# of P for eigenvalue 1 -- equivalently a (right) eigenvector of P.T.
eigvals, eigvecs = np.linalg.eig(transition_matrix.T)

# Select exactly one eigenvector: the one whose eigenvalue is closest to 1.
# A boolean np.isclose mask can match zero columns (empty result -> 0/0) or several
# columns (flatten() would then concatenate eigenvectors into a wrong-length vector).
unit_idx = np.argmin(np.abs(eigvals - 1))

# eig may return complex arrays even when the wanted eigenvector is real;
# keep only the real part so the result is a plain real-valued vector.
stationary_dist = np.real(eigvecs[:, unit_idx])

# Rescale so the entries sum to 1 (this also fixes the arbitrary sign of the
# eigenvector). A new array is created rather than dividing in place, because
# stationary_dist may be a view into eigvecs.
# NOTE: if eigenvalue 1 is repeated, the stationary distribution is not unique
# and this is just one of the possible solutions.
stationary_dist = stationary_dist / stationary_dist.sum()

# Display the stationary distribution
print("Stationary Distribution:", stationary_dist)

Stationary Distribution: [0.13690932 0.1132047  0.12780262 0.21081107 0.08752133 0.11210498
 0.06184532 0.06290826 0.0868924 ]


In [6]:
# Row/column position of 'Aracaju (SE)' in the transition matrix
aracaju_idx = city_index['Aracaju (SE)']

# Three-step transition matrix P^3: entry [i, j] is the probability of being
# in city j exactly three steps after starting in city i
transition_matrix_3_steps = np.linalg.matrix_power(transition_matrix, 3)

# The diagonal entry for Aracaju is the probability of being back there after three steps
aracaju_row_3_steps = transition_matrix_3_steps[aracaju_idx]
prob_return_to_aracaju = aracaju_row_3_steps[aracaju_idx]

print(f"Probability of returning to 'Aracaju (SE)' after 3 steps: {prob_return_to_aracaju}")

Probability of returning to 'Aracaju (SE)' after 3 steps: 0.13331717737273135
