In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("Data/Data.csv")
df = df[["country", "commodity", "loss_percentage", "activity", "food_supply_stage"]]
df

Unnamed: 0,country,commodity,loss_percentage,activity,food_supply_stage
0,Myanmar,"Groundnuts, excluding shelled",5.22,,Whole supply chain
1,Myanmar,"Groundnuts, excluding shelled",5.43,,Whole supply chain
2,Myanmar,"Groundnuts, excluding shelled",5.61,,Whole supply chain
3,Myanmar,"Groundnuts, excluding shelled",5.40,,Whole supply chain
4,Myanmar,"Groundnuts, excluding shelled",5.00,,Whole supply chain
...,...,...,...,...,...
27768,Zambia,Millet,1.27,Storage,Farm
27769,Zambia,Millet,2.50,Winnowing,Farm
27770,Zambia,Millet,1.00,Transportation,Transport
27771,Zambia,Millet,2.38,Storage,Storage


In [3]:
# df[df.food_supply_stage=="Whole supply chain"]

Missing values mostly belong to the rows whose food_supply_stage refer to 'Whole supply chain'.

In [4]:
df=df.dropna(axis=0).reset_index().drop(["index"],axis=1)

In [5]:
df

Unnamed: 0,country,commodity,loss_percentage,activity,food_supply_stage
0,Burundi,Wheat,3.50,"Shelling, Threshing",Farm
1,Burundi,Wheat,4.87,Storage,Farm
2,Burundi,Wheat,2.50,Transportation,Farm
3,Burundi,Wheat,4.43,"Drying, Harvesting",Harvest
4,Burundi,Maize (corn),4.00,Drying,Farm
...,...,...,...,...,...
20865,Zambia,Millet,1.27,Storage,Farm
20866,Zambia,Millet,2.50,Winnowing,Farm
20867,Zambia,Millet,1.00,Transportation,Transport
20868,Zambia,Millet,2.38,Storage,Storage


In [6]:
X=df[["country","commodity","food_supply_stage","activity"]]
Y=df.loss_percentage

In [7]:
splitted=X['activity'].str.split(',', expand=True)

In [8]:
X=pd.concat([X,splitted],axis=1)

In [9]:
X.drop(["activity"],axis=1,inplace=True)

In [10]:
X

Unnamed: 0,country,commodity,food_supply_stage,0,1,2,3,4,5,6,7,8,9,10
0,Burundi,Wheat,Farm,Shelling,Threshing,,,,,,,,,
1,Burundi,Wheat,Farm,Storage,,,,,,,,,,
2,Burundi,Wheat,Farm,Transportation,,,,,,,,,,
3,Burundi,Wheat,Harvest,Drying,Harvesting,,,,,,,,,
4,Burundi,Maize (corn),Farm,Drying,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,Zambia,Millet,Farm,Storage,,,,,,,,,,
20866,Zambia,Millet,Farm,Winnowing,,,,,,,,,,
20867,Zambia,Millet,Transport,Transportation,,,,,,,,,,
20868,Zambia,Millet,Storage,Storage,,,,,,,,,,


In [11]:
listt=[]
for col in range(0,11):
    listt.append(set(X[col].unique()))

In [12]:
listtt=[]
for i in range(0,11):
    for item in listt[i]:
        if item==None:
            continue
        listtt.append(item.strip())
listtt=list(set(listtt))
listtt.sort()

In [13]:
listtt

['Assembling',
 'Bagging',
 'Blanching',
 'Bundling',
 'Cleaning',
 'Collection',
 'Consumption',
 'Curing',
 'Dewatering',
 'Distribution',
 'Drying',
 'Exporting',
 'Farm',
 'Field',
 'Freezing',
 'Grading',
 'Grating',
 'Handling',
 'Harvesting',
 'Layering',
 'Lifting',
 'Loading',
 'Manufacturing',
 'Marketing',
 'Milling',
 'Packaging',
 'Parboiling',
 'Peeling',
 'Piling',
 'Preservation',
 'Processing',
 'Retailing',
 'Ripening',
 'Roasting',
 'Shelling',
 'Sifting',
 'Sorting',
 'Stacking',
 'Storage',
 'Threshing',
 'Trading',
 'Transportation',
 'Unloading',
 'Washing',
 'Wholesale',
 'Winnowing']

In [14]:
X.shape

(20870, 14)

In [15]:
zeros=np.zeros([X.shape[0],len(listtt)],int)
activity_cols=pd.DataFrame(zeros,columns=listtt)
activity_cols

Unnamed: 0,Assembling,Bagging,Blanching,Bundling,Cleaning,Collection,Consumption,Curing,Dewatering,Distribution,...,Sorting,Stacking,Storage,Threshing,Trading,Transportation,Unloading,Washing,Wholesale,Winnowing
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X=X.applymap(lambda x: x.split()[0] if x!=None else x )
X

Unnamed: 0,country,commodity,food_supply_stage,0,1,2,3,4,5,6,7,8,9,10
0,Burundi,Wheat,Farm,Shelling,Threshing,,,,,,,,,
1,Burundi,Wheat,Farm,Storage,,,,,,,,,,
2,Burundi,Wheat,Farm,Transportation,,,,,,,,,,
3,Burundi,Wheat,Harvest,Drying,Harvesting,,,,,,,,,
4,Burundi,Maize,Farm,Drying,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,Zambia,Millet,Farm,Storage,,,,,,,,,,
20866,Zambia,Millet,Farm,Winnowing,,,,,,,,,,
20867,Zambia,Millet,Transport,Transportation,,,,,,,,,,
20868,Zambia,Millet,Storage,Storage,,,,,,,,,,


In [17]:
X

Unnamed: 0,country,commodity,food_supply_stage,0,1,2,3,4,5,6,7,8,9,10
0,Burundi,Wheat,Farm,Shelling,Threshing,,,,,,,,,
1,Burundi,Wheat,Farm,Storage,,,,,,,,,,
2,Burundi,Wheat,Farm,Transportation,,,,,,,,,,
3,Burundi,Wheat,Harvest,Drying,Harvesting,,,,,,,,,
4,Burundi,Maize,Farm,Drying,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,Zambia,Millet,Farm,Storage,,,,,,,,,,
20866,Zambia,Millet,Farm,Winnowing,,,,,,,,,,
20867,Zambia,Millet,Transport,Transportation,,,,,,,,,,
20868,Zambia,Millet,Storage,Storage,,,,,,,,,,


In [18]:
dummies_0=pd.get_dummies(X[0],columns=listtt)
dummies_1=pd.get_dummies(X[1],columns=listtt)
dummies_2=pd.get_dummies(X[2],columns=listtt)
dummies_3=pd.get_dummies(X[3],columns=listtt)
dummies_4=pd.get_dummies(X[4],columns=listtt)
dummies_5=pd.get_dummies(X[5],columns=listtt)
dummies_6=pd.get_dummies(X[6],columns=listtt)
dummies_7=pd.get_dummies(X[7],columns=listtt)
dummies_8=pd.get_dummies(X[8],columns=listtt)
dummies_9=pd.get_dummies(X[9],columns=listtt)
dummies_10=pd.get_dummies(X[10],columns=listtt)


In [19]:
arr=[dummies_0,dummies_1,dummies_2,dummies_3,dummies_4,dummies_5,dummies_6,dummies_7,dummies_8,dummies_9,dummies_10]
for i in arr:
    for col in i.columns:
        activity_cols[col]=activity_cols[col].where(activity_cols[col]>i[col],i[col])

In [20]:
X=pd.concat([X,activity_cols],axis=1)

In [21]:
X.drop([0,1,2,3,4,5,6,7,8,9,10],axis=1,inplace=True)

In [22]:
X

Unnamed: 0,country,commodity,food_supply_stage,Assembling,Bagging,Blanching,Bundling,Cleaning,Collection,Consumption,...,Sorting,Stacking,Storage,Threshing,Trading,Transportation,Unloading,Washing,Wholesale,Winnowing
0,Burundi,Wheat,Farm,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Burundi,Wheat,Farm,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Burundi,Wheat,Farm,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,Burundi,Wheat,Harvest,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Burundi,Maize,Farm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,Zambia,Millet,Farm,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
20866,Zambia,Millet,Farm,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20867,Zambia,Millet,Transport,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
20868,Zambia,Millet,Storage,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [23]:
dummies=pd.get_dummies((X[["country","commodity","food_supply_stage"]]))
X=pd.concat([X,dummies],axis=1)
X.drop(["country","commodity","food_supply_stage"],axis=1,inplace=True)

In [24]:
X

Unnamed: 0,Assembling,Bagging,Blanching,Bundling,Cleaning,Collection,Consumption,Curing,Dewatering,Distribution,...,food_supply_stage_Market,food_supply_stage_Packing,food_supply_stage_Post-harvest,food_supply_stage_Processing,food_supply_stage_Retail,food_supply_stage_Storage,food_supply_stage_Trader,food_supply_stage_Transport,food_supply_stage_Whole,food_supply_stage_Wholesale
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
20868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=30)

Regression Model

In [49]:
reg = LinearRegression().fit(X_train, Y_train)
reg.score(X_test, Y_test)

-1.7448521429378021e+19

In [None]:
lambdaRange = range(1,100)
bestRsq = 0.0                                                # To contain the best R^2.
bestLambda = 0.0                                             # To contain the best lambda.
for aLambda in lambdaRange:
    lasso = Lasso(aLambda)
    lasso.fit(X_train,Y_train)                               # Train with the training set.
    rsq = lasso.score(X_test,Y_test)                         # Calculate R^2 with the testing set.
    if (rsq > bestRsq):                                      # If R^2 is better, replace bestRsq.
        bestRsq = rsq
        bestLambda = aLambda
        print("Lambda = ", bestLambda, " and  R^2 = ", np.round(rsq,3))
lasso = Lasso(bestLambda)                                    # Final run with the bestLambda.
lasso.fit(X,Y)

In [50]:
from sklearn.svm import SVR

In [54]:
X_train

Unnamed: 0,Assembling,Bagging,Blanching,Bundling,Cleaning,Collection,Consumption,Curing,Dewatering,Distribution,...,food_supply_stage_Market,food_supply_stage_Packing,food_supply_stage_Post-harvest,food_supply_stage_Processing,food_supply_stage_Retail,food_supply_stage_Storage,food_supply_stage_Trader,food_supply_stage_Transport,food_supply_stage_Whole,food_supply_stage_Wholesale
17255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
15277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4517,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [57]:
regr = SVR(kernel="linear",C=1.0, epsilon=0.5, verbose=2)


In [58]:
regr.fit(X_train, Y_train)

[LibSVM]...............................................
*..........
*.............
*
optimization finished, #iter = 69387
obj = -11218.662266, rho = -5.443036
nSV = 7701, nBSV = 7247


In [None]:
regr.score(X_test, Y_test)

In [61]:
import statsmodels.api as sm

In [68]:
lm=sm.OLS(Y,X)

In [69]:
model=lm.fit()