# 1. Read the Data

## Printing the first few values from the DataSet

In [33]:
# Preview the first lines of the raw CSV using the shell's `head` command.
# NOTE(review): this previews wfp_market_food_prices.csv, whereas the later
# cells read wfpvam_foodprices_utf8.csv — confirm this is intended.
!head /Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfp_market_food_prices.csv

adm0_id,adm0_name,adm1_id,adm1_name,mkt_id,mkt_name,cm_id,cm_name,cur_id,cur_name,pt_id,pt_name,um_id,um_name,mp_month,mp_year,mp_price,mp_commoditysource
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,1,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,2,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,3,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,4,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,5,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,6,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,7,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,8,2014,50.0,WFP
1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread,87,AFN,15,Retail,5,KG,9,2014,50.0,WFP


## Checking the encoding of the CSV file (this resolved the partial encoding error)

In [34]:
import chardet

# Let chardet guess the encoding from the first 100000 bytes of the file,
# opened in binary mode so no decoding happens before detection.
with open("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfpvam_foodprices_utf8.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
# Bare expression: the detection result becomes the cell output.
result


{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}

## Reading the csv file and creating a dataframe

In [35]:
#importing necessary dependencies
import pandas as pd

# Load the whole CSV into a DataFrame using pandas' default options.
df=pd.read_csv("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfpvam_foodprices_utf8.csv")

#displaying the top 5 rows for testing
df.head(5)

Unnamed: 0,adm0_id,adm0_name,adm1_id,adm1_name,mkt_id,mkt_name,cm_id,cm_name,cur_id,cur_name,pt_id,pt_name,um_id,um_name,mp_month,mp_year,mp_price,mp_commoditysource
0,1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread - Retail,0,AFN,15,Retail,5,KG,1,2014,50.0,
1,1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread - Retail,0,AFN,15,Retail,5,KG,2,2014,50.0,
2,1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread - Retail,0,AFN,15,Retail,5,KG,3,2014,50.0,
3,1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread - Retail,0,AFN,15,Retail,5,KG,4,2014,50.0,
4,1,Afghanistan,272,Badakhshan,266,Fayzabad,55,Bread - Retail,0,AFN,15,Retail,5,KG,5,2014,50.0,


## Column Name Descriptions
1. adm0_id: country id
2. adm0_name: country name
3. adm1_id: locality id
4. adm1_name: locality name
5. mkt_id: market id
6. mkt_name: market name
7. cm_id: commodity purchase id
8. cm_name: commodity purchased
9. cur_id: currency id
10. cur_name: name of currency
11. pt_id: market type id
12. pt_name: market type (Retail/Wholesale/Producer/Farm Gate)
13. um_id: measurement id
14. um_name: unit of goods measurement
15. mp_month: month recorded
16. mp_year: year recorded
17. mp_price: price paid
18. mp_commoditysource: Source supplying price information

#### To read the CSV with a specific encoding, pass the `encoding` parameter to `pd.read_csv`

In [36]:
# Example (left disabled): pass the encoding explicitly when reading the CSV.
#df=pd.read_csv("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfp_market_food_prices_utf8.csv",encoding='ascii')

## Dictionary having the mapping IDs for the market name and location

In [37]:
# Map each adm1_id to its adm1_name; when an id occurs more than once the
# last row seen supplies the value.
market_names_dictionary = df.set_index('adm1_id')['adm1_name'].to_dict()
# To inspect the mapping: print(market_names_dictionary)

In [38]:
# Map each mkt_id to its mkt_name; when an id occurs more than once the
# last row seen supplies the value.
market2_names_dictionary = df.set_index('mkt_id')['mkt_name'].to_dict()
# To inspect the mapping: print(market2_names_dictionary)

# 2. Cleaning the Data

## Creating another dataframe to resolve Market name column 'object' dtype issue

In [39]:
# Re-read the CSV keeping only the columns listed in `usecols`; the list
# leaves out the adm1_name and mkt_name columns.
df_clean=pd.read_csv("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfpvam_foodprices_utf8.csv",usecols = ['adm0_id','adm0_name','adm1_id','mkt_id','cm_id','cm_name','cur_id','cur_name','pt_id','pt_name','um_id','um_name','mp_month','mp_year','mp_price','mp_commoditysource'])

# Preview the first five rows of the reduced frame.
df_clean.head(5)

Unnamed: 0,adm0_id,adm0_name,adm1_id,mkt_id,cm_id,cm_name,cur_id,cur_name,pt_id,pt_name,um_id,um_name,mp_month,mp_year,mp_price,mp_commoditysource
0,1,Afghanistan,272,266,55,Bread - Retail,0,AFN,15,Retail,5,KG,1,2014,50.0,
1,1,Afghanistan,272,266,55,Bread - Retail,0,AFN,15,Retail,5,KG,2,2014,50.0,
2,1,Afghanistan,272,266,55,Bread - Retail,0,AFN,15,Retail,5,KG,3,2014,50.0,
3,1,Afghanistan,272,266,55,Bread - Retail,0,AFN,15,Retail,5,KG,4,2014,50.0,
4,1,Afghanistan,272,266,55,Bread - Retail,0,AFN,15,Retail,5,KG,5,2014,50.0,


## Printing the datatypes of the columns

In [40]:
# Show the dtype pandas assigned to each column of df_clean.
df_clean.dtypes

adm0_id                 int64
adm0_name              object
adm1_id                 int64
mkt_id                  int64
cm_id                   int64
cm_name                object
cur_id                  int64
cur_name               object
pt_id                   int64
pt_name                object
um_id                   int64
um_name                object
mp_month                int64
mp_year                 int64
mp_price              float64
mp_commoditysource    float64
dtype: object

## Convert the CSV to an HDF5 file using the pandas `to_hdf` method (not recommended)

In [42]:
# Show the dtype of each column of df_clean.
# NOTE(review): this repeats the previous dtypes cell; the HDF conversion
# itself happens in the following cell.
df_clean.dtypes

adm0_id                 int64
adm0_name              object
adm1_id                 int64
mkt_id                  int64
cm_id                   int64
cm_name                object
cur_id                  int64
cur_name               object
pt_id                   int64
pt_name                object
um_id                   int64
um_name                object
mp_month                int64
mp_year                 int64
mp_price              float64
mp_commoditysource    float64
dtype: object

In [43]:
# Read the source CSV again into its own DataFrame for the HDF5 export.
df2=pd.read_csv("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfpvam_foodprices_utf8.csv")

# Write the frame as a PyTables "table" under the key 'data', overwriting any
# existing file (mode='w'). `key` is passed by keyword because positional
# arguments after the path are deprecated in recent pandas releases.
df2.to_hdf("/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/1st_Project/Food_prices_hdf5.h5", key='data', mode='w', format='table')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [44]:
# Read the table stored under the key 'data' back from the HDF5 file and show
# its first five rows as the cell's rich output (same style as the other
# preview cells) rather than a plain-text print().
pd.read_hdf('/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/1st_Project/Food_prices_hdf5.h5', key='data').head(5)

   adm0_id    adm0_name  adm1_id   adm1_name  mkt_id  mkt_name  cm_id  \
0        1  Afghanistan      272  Badakhshan     266  Fayzabad     55   
1        1  Afghanistan      272  Badakhshan     266  Fayzabad     55   
2        1  Afghanistan      272  Badakhshan     266  Fayzabad     55   
3        1  Afghanistan      272  Badakhshan     266  Fayzabad     55   
4        1  Afghanistan      272  Badakhshan     266  Fayzabad     55   

          cm_name  cur_id cur_name  pt_id pt_name  um_id um_name  mp_month  \
0  Bread - Retail       0      AFN     15  Retail      5      KG         1   
1  Bread - Retail       0      AFN     15  Retail      5      KG         2   
2  Bread - Retail       0      AFN     15  Retail      5      KG         3   
3  Bread - Retail       0      AFN     15  Retail      5      KG         4   
4  Bread - Retail       0      AFN     15  Retail      5      KG         5   

   mp_year  mp_price  mp_commoditysource  
0     2014      50.0                 NaN  
1     

# Importing Arkouda

In [45]:
import arkouda as ak

In [46]:
# Connect to the Arkouda server. The URL literal previously ended with a stray
# space after the port number; it is removed so the URL is well-formed.
ak.connect(connect_url='tcp://MacBook-Pro-5.local:5555') #connecting to arkouda server

connected to arkouda server tcp://*:5555


## Conversion to .hdf file using Arkouda (recommended)

In [52]:
import numpy as np
# Load the CSV into a separate DataFrame.
# NOTE(review): `pdgreen` is not referenced by the other cells visible in this
# notebook (the converter is later called with `df`) — confirm whether this
# extra read is needed.
pdgreen = pd.read_csv('/Users/narenkhatwani/Documents/GitHub/arkouda/Notebooks/Resources/wfpvam_foodprices_utf8.csv')
# transfer columns of DataFrame to arkouda
def ak_create_akdict_from_df(df):
    """Convert every column of a pandas DataFrame with ``ak.from_series``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are transferred one by one.

    Returns
    -------
    dict
        Maps each column name to the object returned by ``ak.from_series``
        for that column.
    """
    akdict = {}
    for cname in df.keys():
        if df[cname].dtype.name == 'object':
            # Object columns are sent as strings. The builtin ``str`` is used
            # because the ``np.str`` alias was deprecated in NumPy 1.20 and
            # removed in 1.24, where it raises AttributeError.
            akdict[cname] = ak.from_series(df[cname], dtype=str)
        else:
            akdict[cname] = ak.from_series(df[cname])

    return akdict


### Passing the DataFrame through the converter function to obtain a dictionary of Arkouda arrays (no HDF file is written by this step)

In [57]:
# Convert the columns of `df` into a dictionary keyed by column name.
green_from_pandas = ak_create_akdict_from_df(df)

In [58]:
# Print the converted dictionary (one entry per DataFrame column).
print(green_from_pandas)

{'adm0_id': array([1 1 1 ... 152 152 152]), 'adm0_name': array(['Afghanistan', 'Afghanistan', 'Afghanistan', ... , 'Malawi', 'Malawi', 'Malawi']), 'adm1_id': array([272 272 272 ... 1890 1890 1890]), 'adm1_name': array(['Badakhshan', 'Badakhshan', 'Badakhshan', ... , 'Southern Region', 'Southern Region', 'Southern Region']), 'mkt_id': array([266 266 266 ... 823 823 823]), 'mkt_name': array(['Fayzabad', 'Fayzabad', 'Fayzabad', ... , 'Thondwe', 'Thondwe', 'Thondwe']), 'cm_id': array([55 55 55 ... 51 51 51]), 'cm_name': array(['Bread - Retail', 'Bread - Retail', 'Bread - Retail', ... , 'Maize - Retail', 'Maize - Retail', 'Maize - Retail']), 'cur_id': array([0 0 0 ... 0 0 0]), 'cur_name': array(['AFN', 'AFN', 'AFN', ... , 'MWK', 'MWK', 'MWK']), 'pt_id': array([15 15 15 ... 15 15 15]), 'pt_name': array(['Retail', 'Retail', 'Retail', ... , 'Retail', 'Retail', 'Retail']), 'um_id': array([5 5 5 ... 5 5 5]), 'um_name': array(['KG', 'KG', 'KG', ... , 'KG', 'KG', 'KG']), 'mp_month': array([1 2 3 .

In [62]:
# Bind a second name to the same dictionary; no copy is made.
data= green_from_pandas

# 3. Analysis part

In [60]:
def describe(x):
    """Print the mean, standard deviation, minimum and maximum of ``x``.

    ``x`` must provide ``dtype``, ``mean()``, ``std()``, ``min()`` and
    ``max()``. When ``x.dtype`` equals ``ak.float64`` every statistic is
    printed with two decimal places; otherwise the default ``str.format``
    rendering is used. Nothing is returned.
    """
    if x.dtype == ak.float64:
        # Fixed two-decimal rendering for float data.
        template = 'mean: {:.2f}\nstd : {:.2f}\nmin : {:.2f}\nmax : {:.2f}'
    else:
        template = 'mean: {}\nstd : {}\nmin : {}\nmax : {}'
    stats = (x.mean(), x.std(), x.min(), x.max())
    print(template.format(*stats))

In [63]:
# Print summary statistics for the mp_price column.
describe(data['mp_price'])

mean: 6282.79
std : 64147.73
min : 0.00
max : 21777780.00


# 4.

## Arkouda Configuration

In [49]:
# Fetch the configuration reported by ak.get_config() and keep it in `cfg`.
cfg=ak.get_config()

In [50]:
# Bare expression: display the fetched configuration as the cell output.
cfg

{'arkoudaVersion': 'v2021.08.20+18.g7a657ad',
 'ZMQVersion': '4.3.2',
 'HDF5Version': '1.12.1',
 'serverHostname': 'MacBook-Pro-5.local',
 'ServerPort': 5555,
 'numLocales': 1,
 'numPUs': 4,
 'maxTaskPar': 4,
 'physicalMemory': 8589934592,
 'distributionType': 'domain(1,int(64),false)',
 'LocaleConfigs': [{'id': 0,
   'name': 'MacBook-Pro-5.local',
   'numPUs': 4,
   'maxTaskPar': 4,
   'physicalMemory': 8589934592}],
 'authenticate': False,
 'logLevel': 'INFO',
 'byteorder': 'little'}