In [37]:
import geopandas as gpd
import pandas as pd
from shapely import Point
import matplotlib.pyplot as plt
import numpy as np
import unicodedata

## 0- Read data

In [38]:
# Load the literacy table; the first spreadsheet column becomes the index.
literacy_path = "./data/turkey_literacy.xls"
df_literacy = pd.read_excel(literacy_path, index_col=[0])
df_literacy.head(n=2)

Unnamed: 0_level_0,city-town,man_literate,man_non_literate,woman_literate,woman_non_literate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,Adana(Aladağ)-1757,97.94,2.06,85.05,14.95
2014,Adana(Ceyhan)-1219,98.63,1.37,93.18,6.82


### Read dataframe **df_town_locations**.

* **common-operational-dataset.xlsx** is taken from "Turkey - Subnational Administrative Boundaries" dataset  shared by UN HUMANITARIAN DATA EXCHANGE.

* "Turkey - Subnational Administrative Boundaries" dataset is part of the data series : COD - Subnational Administrative Boundaries

*  Page for "Turkey - Subnational Administrative Boundaries":https://data.humdata.org/dataset/cod-ab-tur


In [39]:
# Read four columns (selected by position) from the ADM2 sheet, then rename
# "adm2_tr" -> "town" and "adm1_tr" -> "city".
location_columns = [1, 3, 9, 10]
df_town_locations = pd.read_excel(
    "common-operational-dataset.xlsx",
    sheet_name="ADM2",
    usecols=location_columns,
).rename(columns={"adm2_tr": "town", "adm1_tr": "city"})
df_town_locations.head(n=2)

Unnamed: 0,town,city,latitude,longitude
0,ALADAĞ,ADANA,37.546695,35.394909
1,CEYHAN,ADANA,37.028062,35.818333


## 1-Pre-processing

### 1-1 Pre-processing df_literacy

We parse the contents of "city-town" column and create new columns for city, town and town_code.<br>
We also add columns literate and non_literate.

In [40]:
def _split_city_town(label):
    """Split a label such as "Adana(Aladağ)-1757" into (city, town, town_code).

    The code is taken after the LAST hyphen, so a hyphen inside the city or
    town name does not corrupt ``town_code``.

    Args:
        label: String of the form "<city>(<town>)-<town_code>".

    Returns:
        Tuple ``(city, town, town_code)`` with surrounding whitespace stripped.

    Raises:
        ValueError: if the label lacks the "(", ")" or "-" separators.
    """
    head, hyphen, town_code = label.rpartition("-")
    city, paren, rest = head.partition("(")
    if not hyphen or not paren or ")" not in rest:
        raise ValueError(f"unexpected city-town label: {label!r}")
    town = rest[: rest.index(")")]
    return city.strip(), town.strip(), town_code.strip()


# Guarded so the cell can be re-run: once "city-town" has been parsed and
# dropped, a second execution is a no-op instead of raising KeyError.
if "city-town" in df_literacy.columns:
    parts = [_split_city_town(label) for label in df_literacy["city-town"]]
    # Columns are added in this order to keep the original column layout.
    df_literacy["city"] = [city for city, _, _ in parts]
    df_literacy["town_code"] = [code for _, _, code in parts]
    df_literacy["town"] = [town for _, town, _ in parts]

    # NOTE(review): the man_*/woman_* columns look like percentages in the
    # preview, so these sums range up to 200 and are not an overall rate --
    # confirm that a plain sum is what is intended.
    df_literacy["literate"] = df_literacy["man_literate"] + df_literacy["woman_literate"]
    df_literacy["non_literate"] = (
        df_literacy["man_non_literate"] + df_literacy["woman_non_literate"]
    )
    df_literacy = df_literacy.drop(columns="city-town")

df_literacy.head(2)

Unnamed: 0_level_0,man_literate,man_non_literate,woman_literate,woman_non_literate,city,town_code,town,literate,non_literate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014,97.94,2.06,85.05,14.95,Adana,1757,Aladağ,182.99,17.01
2014,98.63,1.37,93.18,6.82,Adana,1219,Ceyhan,191.81,8.19


### 1-2 Pre-processing df_town_locations

In [41]:
# Preview the raw (still upper-case) town and city names before cleaning.
df_town_locations.head(n=2)

Unnamed: 0,town,city,latitude,longitude
0,ALADAĞ,ADANA,37.546695,35.394909
1,CEYHAN,ADANA,37.028062,35.818333


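In pre-processing df_town_locations we convert the city and town names from uppercase to title form for compatibility with the other dataframe.<br>
However, there is an issue to handle: Python's default case conversion is not Turkish-aware. Lowercasing "İ" produces two code points ("i" plus a combining dot), which makes the string one character longer, and "I" becomes "i" instead of "ı".<br>
The cell below avoids this by mapping "I" → "ı" and "İ" → "i" explicitly before title-casing; a Unicode NFC normalization helper is also defined.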

In [42]:
# Helper for the Turkish 'İ' length issue: NFC (Normalization Form Canonical
# Composition) stores 'İ' as one code point instead of 'I' + combining dot.
# Note: the cells shown in this notebook do not call this helper.
def normalize_string(s):
    """Return ``s`` converted to Unicode NFC (canonical composition) form."""
    composed = unicodedata.normalize("NFC", s)
    return composed
# Translation table for the two Turkish "I" letters:
# dotless capital "I" -> "ı" and dotted capital "İ" -> "i".
lower_map = str.maketrans({"I": "ı", "İ": "i"})


def lower_fix_letter_i(x):
    """Return ``x`` with every "I" replaced by "ı" and every "İ" by "i".

    All other characters (including other upper-case letters) are unchanged.
    """
    translated = x.translate(lower_map)
    return translated
def title_fix_letter_i(x):
    """Replace a leading lower-case "i" with the Turkish capital "İ".

    Only the first character is examined; the rest of the string is returned
    unchanged. An empty string is returned as-is (the previous ``x[0]`` check
    raised IndexError on empty input).

    Args:
        x: The string to fix.

    Returns:
        ``x`` with its first character changed to "İ" if it was "i",
        otherwise ``x`` unchanged.
    """
    # startswith() is safe on "" whereas indexing x[0] is not.
    if x.startswith("i"):
        return "İ" + x[1:]
    return x

# Title-case the city and town names with the Turkish "I" handling applied
# first: map I/İ, restore a leading "İ", then let str.title do the rest.
for name_column in ("city", "town"):
    fixed_names = df_town_locations[name_column].apply(lower_fix_letter_i)
    fixed_names = fixed_names.apply(title_fix_letter_i)
    df_town_locations[name_column] = fixed_names.apply(str.title)
df_town_locations.head(n=2)

Unnamed: 0,town,city,latitude,longitude
0,Aladağ,Adana,37.546695,35.394909
1,Ceyhan,Adana,37.028062,35.818333


### 1-3 Making town names of **df_literacy** and **df_town_locations** compatible

* In df_literacy some town names are not up to date because the towns have since been renamed.
<br> For instance "Kazan" has become "Kahramankazan" and "Eyüp" has become "Eyüpsultan".
First we fix these town names.
* In df_town_locations the city name is used as the town name instead of "Merkez" (center); these names will be replaced with "Merkez".

First, let's check the town names that are in **df_literacy** but not in **df_town_locations**.

In [43]:
# Town names present in df_literacy but missing from df_town_locations.
literacy_towns = set(df_literacy["town"].unique())
location_towns = set(df_town_locations["town"].unique())
print(literacy_towns - location_towns)

{'Merkez', 'Eyüp', 'Bahşili', 'Kazan'}


Let's replace the outdated names in df_literacy with the current names.

In [44]:
# Map the town names that did not match to the spellings used in df_town_locations.
town_renames = {"Kazan": "Kahramankazan", "Bahşili": "Bahşılı", "Eyüp": "Eyüpsultan"}
df_literacy["town"] = df_literacy["town"].replace(town_renames)

#### Check **df_literacy \ df_town_locations**

Let's check again (town names that are in **df_literacy** but not in **df_town_locations**).

In [45]:
# Re-check after renaming: towns in df_literacy that df_town_locations lacks.
literacy_towns = set(df_literacy["town"].unique())
location_towns = set(df_town_locations["town"].unique())
print(literacy_towns - location_towns)

{'Merkez'}


In **df_town_locations** the city name is used as the town name instead of the word "Merkez" ("Center"), which denotes the downtown district.<br>
Therefore we **replace** the **city names** in the **town** column with the word **"Merkez"**.

In [46]:
# Rows whose town name equals the city name get the town "Merkez".
town_is_city_name = df_town_locations["town"] == df_town_locations["city"]
df_town_locations.loc[town_is_city_name, "town"] = "Merkez"

Let's check the town names again (those that are in **df_literacy** but not in **df_town_locations**).

In [47]:
# Final check: towns in df_literacy that are still missing from df_town_locations.
literacy_towns = set(df_literacy["town"].unique())
location_towns = set(df_town_locations["town"].unique())
print(literacy_towns - location_towns)

set()


All town names in df_literacy are also in df_town_locations.<br>
Now let's also check the town names that are in **df_town_locations** but not in **df_literacy**.

#### Check **df_town_locations \ df_literacy**

In [48]:
# Reverse check: towns in df_town_locations that df_literacy lacks.
location_towns = set(df_town_locations["town"].unique())
literacy_towns = set(df_literacy["town"].unique())
print(location_towns - literacy_towns)

set()


df_town_locations has no additional town names either, so the town names match.<br>
We can also see that both have the same number of rows (we select one year for df_literacy).

In [49]:
# Compare the location table with the 2022 slice of the literacy table.
locations_shape = df_town_locations.shape
literacy_2022_shape = df_literacy.loc[2022].shape
(locations_shape, literacy_2022_shape)

((973, 4), (973, 9))

## 2-Convert the DataFrame to a GeoDataFrame

In [50]:
# Build one point per town (x = longitude, y = latitude) and attach the points
# as the geometry of a GeoDataFrame in EPSG:4326.
lon_lat_pairs = zip(df_town_locations["longitude"], df_town_locations["latitude"])
geometry = [Point(lon_lat) for lon_lat in lon_lat_pairs]
df_town_locations = gpd.GeoDataFrame(
    df_town_locations,
    geometry=geometry,
    crs="EPSG:4326",
)
df_town_locations.head(n=2)

Unnamed: 0,town,city,latitude,longitude,geometry
0,Aladağ,Adana,37.546695,35.394909,POINT (35.39491 37.54670)
1,Ceyhan,Adana,37.028062,35.818333,POINT (35.81833 37.02806)


In [51]:
## 3-Select a year for df_literacy and merge to get results

In [52]:
def get_results_for_year(year, df_town_locations, df_literacy):
    """Left-join one year's literacy rows onto the town locations.

    Args:
        year: Label selected from the index of ``df_literacy``.
        df_town_locations: Frame with "city" and "town" columns.
        df_literacy: Frame indexed by year, with "city" and "town" columns.

    Returns:
        ``df_town_locations`` indexed by ("city", "town") with the columns of
        the selected literacy rows joined on. Towns without a literacy row for
        ``year`` keep their location and get missing values.

    Raises:
        KeyError: if ``year`` is not present in the index of ``df_literacy``.
    """
    # A list selector always yields a DataFrame. Plain .loc[year] collapses a
    # single matching row into a Series, which has no set_index().
    literacy_for_year = df_literacy.loc[[year]].set_index(["city", "town"])
    locations_by_town = df_town_locations.set_index(["city", "town"])
    return locations_by_town.join(literacy_for_year)

In [53]:
# Join the 2022 literacy figures onto the town locations.
df_results = get_results_for_year(
    year=2022,
    df_town_locations=df_town_locations,
    df_literacy=df_literacy,
)
df_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,geometry,man_literate,man_non_literate,woman_literate,woman_non_literate,town_code,literate,non_literate
city,town,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Adana,Aladağ,37.546695,35.394909,POINT (35.39491 37.54670),99.16,0.84,90.43,9.57,1757,189.59,10.41
Adana,Ceyhan,37.028062,35.818333,POINT (35.81833 37.02806),99.14,0.86,95.45,4.55,1219,194.59,5.41
Adana,Çukurova,37.057838,35.270706,POINT (35.27071 37.05784),99.55,0.45,98.53,1.47,2033,198.08,1.92
Adana,Feke,37.815151,35.911644,POINT (35.91164 37.81515),98.83,1.17,93.56,6.44,1329,192.39,7.61
Adana,İmamoğlu,37.258022,35.661951,POINT (35.66195 37.25802),99.25,0.75,96.35,3.65,1806,195.6,4.4


In [58]:

# Interactive map of the towns, colored by the "non_literate" column.
explore_options = dict(
    column="non_literate",  # value that drives the color scale
    cmap="Reds",  # matplotlib colormap name
    tiles="CartoDB positron",  # background tiles
    popup=True,  # show all values in a popup on click
    style_kwds={"color": "black"},  # black outline
)
turkey_map = df_results.explore(**explore_options)
turkey_map

In [None]:
import folium
from folium.plugins import MarkerCluster

# NOTE(review): circle radius per unit of "non_literate". The previous loop
# used a "normalized_quota" value that does not exist in df_results, so this
# scale is a placeholder -- tune as needed.
CIRCLE_RADIUS_PER_UNIT = 500

marker_cluster = MarkerCluster().add_to(turkey_map)

# df_results is indexed by (city, town); each row holds the town's point
# geometry plus the literacy columns. The previous loop unpacked each row into
# six university-related names, which fails on this frame.
for (city, town), row in df_results.iterrows():
    point = row["geometry"]
    non_literate = row["non_literate"]
    # Skip rows that cannot be drawn: no point, or no value to size the circle.
    if point is None or point.is_empty or pd.isna(non_literate):
        continue
    location = (point.y, point.x)  # folium expects (latitude, longitude)
    popup = (
        f"City:{city}<br>Town:{town}"
        f"<br>Literate:{row['literate']}<br>Non_literate:{non_literate}"
    )
    folium.Circle(
        location=location,
        radius=CIRCLE_RADIUS_PER_UNIT * float(non_literate),
        popup=popup,
        fill=True,
    ).add_to(turkey_map)
    folium.Marker(location, popup=popup).add_to(marker_cluster)