In [1]:
import pandas as pd

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.\
        config("spark.jars.repositories", "https://repos.spark-packages.org/").\
        enableHiveSupport().getOrCreate()

In [12]:
immigration = spark.read.parquet("tables/immigration")
country = spark.read.parquet("tables/country")
airport = spark.read.parquet("tables/airport")
city = spark.read.parquet("tables/city")

### 1. Finding your fellow country people
What may be of interest to emigrants is to move to a city which others from your country are moving. To do this, find the top 10 cities with the greatest number of people from your country emigrating to.

In [33]:
(country.
 filter("country_name == 'Ireland'").
 join(immigration, 
      country.country_code == immigration.source_country_code, 
      how = "left").
 join(city,
     immigration.arrival_city_code == city.city_code).
 groupBy("state_code", "city_name").
 count().
 orderBy(f.col("count").desc())
).limit(10).show()

+----------+-----------+-----+
|state_code|  city_name|count|
+----------+-----------+-----+
|        CA|Los Angeles|  932|
|        FL|    Orlando|  877|
|        FL|      Miami|  621|
|        WA|    Seattle|  440|
|        LA|New Orleans|  408|
|        TX|     Dallas|  333|
|        MA|     Boston|  262|
|        GA|    Atlanta|  242|
|        TX|    Houston|  220|
|        TX|     Austin|  115|
+----------+-----------+-----+



### 2. Finding cities with a tolerable climate
You may want to discover cities whose climate is more similar to what you are used to.
In this situation, you many want to uncover popular destinations (>100 emigrants from your home country) which have the most similar mean temperature to your own country.

In [49]:
(country.
 filter("country_name == 'Ireland'").
 join(immigration.select("source_country_code", "arrival_city_code"), 
      country.country_code == immigration.source_country_code, 
      how = "left").
 join(city,
     immigration.arrival_city_code == city.city_code).
 withColumn("temperature_difference", f.abs(country.mean_temperature - city.mean_temperature)).
 groupBy("city_name").
 agg(f.count("city_name").alias("count"), f.mean("temperature_difference").alias("temperature_difference")).
 filter("count >= 100").
 orderBy("temperature_difference")
).limit(10).toPandas()

Unnamed: 0,city_name,count,temperature_difference
0,Denver,114,0.198618
1,Boston,262,1.122063
2,Seattle,440,1.73418
3,Atlanta,242,5.363774
4,Los Angeles,932,6.637486
5,Dallas,333,8.962349
6,Austin,115,10.837133
7,Houston,220,11.099774
8,New Orleans,408,11.119062
9,Orlando,877,13.087362


### 3. Find cities with the correct gender balance
As a solo traveller, women may be concerned with the safety of a candidate destination. Cities which have a large female population currently, or have a large influx of female immigrants may be desirable destinations.
We might consider cities with at least 50% female population, ordered by how popular they currently are female immigrants. 

In [94]:
female_immigrant_proportion = (
    immigration.
    groupBy("arrival_city_code", "gender").
    count().
    withColumn("total_immigrants", f.sum("count").over(Window.partitionBy("arrival_city_code"))).
    withColumn("female_immigrant_proportion", f.col("count") / f.col("total_immigrants")).
    filter("gender == 'F'").
    select("arrival_city_code", "female_immigrant_proportion")
)

(city.
 withColumn("female_proportion", city.female_population / (city.total_population)).
 filter('female_proportion >= 0.5').
 join(female_immigrant_proportion, 
      female_immigrant_proportion.arrival_city_code == city.city_code).
 orderBy(f.col("female_immigrant_proportion").desc()).
 select("city_name", "state_code", "female_proportion", "female_immigrant_proportion")
).limit(10).toPandas()

Unnamed: 0,city_name,state_code,female_proportion,female_immigrant_proportion
0,Newport News,VA,0.515393,1.0
1,Albuquerque,NM,0.511165,1.0
2,Ontario,CA,0.50316,0.636632
3,Sacramento,CA,0.515556,0.628319
4,Providence,RI,0.502857,0.583333
5,Syracuse,NY,0.518134,0.5
6,Oakland,CA,0.513862,0.498012
7,Santa Ana,CA,0.500622,0.48859
8,New Orleans,LA,0.523286,0.467651
9,Pittsburgh,PA,0.508221,0.456522
