In [23]:
from pyspark.sql.functions import concat, col, lit, split, regexp_replace, desc, asc
# Load the three yearly extracts as Spark DataFrames. Every file is read as CSV
# with a header row, and Spark is asked to infer the column types.
csv1 = '2012_semi_processed.csv'
csv2 = '2002.csv'
csv3 = '2007.csv'
_2012, _2002, _2007 = (
    spark.read.format('csv').options(header=True, inferSchema=True).load(source_path)
    for source_path in (csv1, csv2, csv3)
)

In [3]:
# Preview the 2002 frame to sanity-check the columns that were read.
_2002.show()

+---+---+-----+-----+-----+-------+---------+-----+-----+
|_c0|EMP|ESTAB|EMP_F|OPTAX|ESTAB_F|NAICS2002|state|place|
+---+---+-----+-----+-----+-------+---------+-----+-----+
|  0|  0|   26|    f|   99|   null|    31-33|    1| 2116|
|  1|  0|    6|    b|   00|   null|       42|    1| 2116|
|  2|  0|    6|    b|   10|   null|       42|    1| 2116|
|  3|  0|    5|    a|   00|   null|      423|    1| 2116|
|  4|  0|    5|    a|   10|   null|      423|    1| 2116|
|  5|  0|    1|    a|   00|   null|      424|    1| 2116|
|  6|  0|    1|    a|   10|   null|      424|    1| 2116|
|  7|  0|    1|    a|   00|   null|     4247|    1| 2116|
|  8|  0|    1|    a|   00|   null|    42471|    1| 2116|
|  9|  0|    1|    a|   00|   null|   424710|    1| 2116|
| 10|760|   69| null|   99|   null|    44-45|    1| 2116|
| 11|131|   13| null|   99|   null|      441|    1| 2116|
| 12|  0|    4|    b|   99|   null|    44112|    1| 2116|
| 13|  0|    4|    b|   99|   null|   441120|    1| 2116|
| 14| 16|    3

In [4]:
# Unique NAICS (title, code) pairs taken from the 2012 data; used later as the
# lookup table when joining titles onto other years.
naics_info = (
    _2012
    .select('NAICS2012_TTL', 'NAICS2012')
    .distinct()
)

In [5]:
# Unique city records from the 2012 data: city name, state and place codes,
# and the unique city_state_code.
city_info = (
    _2012
    .select('GEO_TTL', 'state', 'place', 'city_state_code')
    .distinct()
)

In [6]:
# Rename the 2012 state/place columns so they do not clash with the identically
# named columns of the other years' frames when joining and writing files.
# The lookup is built straight from _2012 instead of from the previous value of
# city_info so that re-running this cell is safe: renaming city_info from itself
# fails on a second run because its `state`/`place` columns no longer exist.
city_info = _2012.selectExpr(
    "GEO_TTL",
    "state as state_2012",
    "place as place_2012",
    "city_state_code",
).dropDuplicates()

In [7]:
# Check the city lookup after the column renames.
city_info.show()

+--------------------+----------+----------+--------------------+
|             GEO_TTL|state_2012|place_2012|           fuck_this|
+--------------------+----------+----------+--------------------+
|      Gurnee village|        17|     32018|                t_IL|
|South Barrington ...|        17|     70564|                t_IL|
|   Hampshire village|        17|     32525|                t_IL|
|           Olney CDP|        24|     58900|           Olney _MD|
|     Sweetwater city|        48|     71540|      Sweetwater _TX|
|     Scottsdale city|         4|     65000|      Scottsdale _AZ|
|Security-Widefiel...|         8|     68847|Security-Widefiel...|
|Cherry Hills Vill...|         8|     13845|Cherry Hills Vill...|
|      Crestview city|        12|     15475|       Crestview _FL|
|   Jupiter Farms CDP|        12|     35890|   Jupiter Farms _FL|
|Hutchinson Island...|        12|     32993|Hutchinson Island...|
|         Buford city|        13|     11784|          Buford _GA|
|   Watkin

In [8]:
# Attach the 2012 NAICS titles to the 2002 rows by matching industry codes.
# NOTE(review): this is an inner join, so 2002 rows whose code has no matching
# 2012 code are dropped — confirm that is intended.
naics_code_match = _2002['NAICS2002'] == naics_info['NAICS2012']
naics_updates = _2002.join(naics_info, on=naics_code_match, how='inner')
naics_updates.show()

+-----+---+-----+-----+-----+-------+---------+-----+-----+-------------+---------+
|  _c0|EMP|ESTAB|EMP_F|OPTAX|ESTAB_F|NAICS2002|state|place|NAICS2012_TTL|NAICS2012|
+-----+---+-----+-----+-----+-------+---------+-----+-----+-------------+---------+
|  964|  0|    5|    c|    A|   null|   712110|    1| 7000|      Museums|   712110|
|  965|  0|    5|    c|    N|   null|   712110|    1| 7000|      Museums|   712110|
| 3695|  0|    4|    b|    A|   null|   712110|    1|37000|      Museums|   712110|
| 3696|  0|    3|    b|    N|   null|   712110|    1|37000|      Museums|   712110|
|18952|  0|    5|    b|    A|   null|   712110|    1|50000|      Museums|   712110|
|18953|  0|    5|    b|    N|   null|   712110|    1|50000|      Museums|   712110|
|20454|  0|    3|    b|    A|   null|   712110|    1|51000|      Museums|   712110|
|20455|  0|    3|    b|    N|   null|   712110|    1|51000|      Museums|   712110|
|25704|  0|    4|    b|    A|   null|   712110|    2| 3000|      Museums|   

In [9]:
# Attach the 2012 city names to the titled 2002 rows, matching on both the
# state code and the place code. The full outer join keeps rows that have no
# match on the other side (their missing columns come back as null).
state_matches = naics_updates['state'] == city_info['state_2012']
place_matches = naics_updates['place'] == city_info['place_2012']
job_updates = naics_updates.join(city_info, on=state_matches & place_matches, how='full')
job_updates.show()

+-----+----+-----+-----+-----+-------+---------+-----+-----+--------------------+---------+-------------------+----------+----------+------------------+
|  _c0| EMP|ESTAB|EMP_F|OPTAX|ESTAB_F|NAICS2002|state|place|       NAICS2012_TTL|NAICS2012|            GEO_TTL|state_2012|place_2012|         fuck_this|
+-----+----+-----+-----+-----+-------+---------+-----+-----+--------------------+---------+-------------------+----------+----------+------------------+
| null|null| null| null| null|   null|     null| null| null|                null|     null|  Meridianville CDP|         1|     48112| Meridianville _AL|
|14925|   0|    2|    e|    A|   null|      623|    1|61008|Nursing and resid...|      623|Pleasant Grove city|         1|     61008|Pleasant Grove _AL|
|14926|   0|    2|    e|    T|   null|      623|    1|61008|Nursing and resid...|      623|Pleasant Grove city|         1|     61008|Pleasant Grove _AL|
|14931|  39|    6| null|    A|   null|       81|    1|61008|Other services (e...| 

In [24]:
# Attach the 2012 city names to the 2007 rows, matching on both the state code
# and the place code (full outer join, so unmatched rows are kept with nulls).
updated_2007 = _2007.join(
    city_info,
    on=(_2007['state'] == city_info['state_2012']) & (_2007['place'] == city_info['place_2012']),
    how='full',
)
updated_2007.show()

+-----+---------------+----+-----+-----+--------------------+---------+-----+-----+-------------------+----------+----------+------------------+
|  _c0|GEOTYPE_MEANING| EMP|ESTAB|OPTAX|   NAICS2007_MEANING|NAICS2007|state|place|            GEO_TTL|state_2012|place_2012|         fuck_this|
+-----+---------------+----+-----+-----+--------------------+---------+-----+-----+-------------------+----------+----------+------------------+
| null|           null|null| null| null|                null|     null| null| null|  Meridianville CDP|         1|     48112| Meridianville _AL|
|54915|          Place|   0|    3|   10|     Wholesale trade|       42|    1|61008|Pleasant Grove city|         1|     61008|Pleasant Grove _AL|
|54955|          Place|   0|    3|   10|Merchant wholesal...|      423|    1|61008|Pleasant Grove city|         1|     61008|Pleasant Grove _AL|
|55263|          Place| 150|   10|   99|        Retail trade|    44-45|    1|61008|Pleasant Grove city|         1|     61008|Pleas

In [18]:
# Convert the joined 2002 Spark DataFrame (NAICS titles + city info) into a
# pandas DataFrame so it can be written out with pandas.
_2002_updated = job_updates.toPandas()

In [25]:
# Convert the joined 2007 Spark DataFrame (with city info) into a pandas
# DataFrame so it can be written out with pandas.
_2007_updated_pd = updated_2007.toPandas()

In [26]:
# Save the joined 2007 data. index=False keeps the pandas row index out of the
# file; otherwise it is written as an extra unnamed first column, which comes
# back as a spurious `_c0` column when the CSV is read with Spark.
_2007_updated_pd.to_csv('_2007_updated_pd.csv', index=False)

In [22]:
# Save the joined 2002 data. index=False keeps the pandas row index out of the
# file; otherwise it is written as an extra unnamed first column, which comes
# back as a spurious `_c0` column when the CSV is read with Spark.
_2002_updated.to_csv('_2002_updated_pd.csv', index=False)