In [1]:
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np 
from urllib.request import urlopen as uopen
from urllib.request import Request as request
from functools import reduce
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.matrix 


---
---
# some initialisations :
---
---

In [2]:
colors = ['Blanc', 'Noir', 'Or', 'Rose',
       'Mesh Rose', 'Black', 'Argent', 'Beige',
       'Kaki','Vert','Bleu', 'Jaune',
       'Rouge', 'Bleu','Gris', 'Bleu Nuit', 'Bleu Pastel', 'Camel',
       'Gold', 'Grenat','Gris Clair','Mauve', 'Multicolore','Bronze',
       'Noir/N', 'Orange','PINK', 'Rose Clair', ' Rose clair',
       'Violet',' Silver', 'Blanche','Marron','Verte']
colors = list(map(lambda x: x.lower().strip(), colors))
colors = np.unique(colors) 

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 '}
df = pd.DataFrame()


---
---
# extarction des donnees:
---
---

In [3]:
def extract(articles, df, gender):
    for article in articles:
        # extract
        full_name = article.find_all("h3", {"class": "name"})[0].text.lower()
        price = article.find_all("div", {"class": "prc"})[0].text
        reduction = article.find_all("div", {"class": "tag _dsct _sm"})
        stars = article.find_all("div", {"class": "stars _s"})

        # reduction / stars TRT
        reduction = reduction[0].text[:-1] if reduction else 0
        stars = stars[0].text.split()[0] if stars else 0

        a= {
        "gender" : gender,
        "description" : full_name,
        "price" : price,
        "stars" : float(stars)/10,
        "reduction": float(reduction)/100
        }
        df = df.append(a, ignore_index=True)

    return df

In [4]:

for j in range(2):
    gender = "homme" if j  else "femme"
    for i in range(1, 8):
        url = 'https://www.jumia.dz/catalog/?q=basket+'+gender+'&page='+str(i)+'#catalog-listing'
        print(url)
        req = request(url=url, headers=headers) 
        client = uopen(req)
        page_html = client.read()
        client.close()
        parsed = soup(page_html, "html.parser") 
        articles = parsed.find_all("article",{ 'class' :"prd _fb col c-prd"})
        df = extract(articles, df, gender)

https://www.jumia.dz/catalog/?q=basket+femme&page=1#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=2#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=3#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=4#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=5#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=6#catalog-listing
https://www.jumia.dz/catalog/?q=basket+femme&page=7#catalog-listing


In [None]:
df.to_csv("data.csv", index = False)

---
---
#    pretraitement des donnees:
---
---

In [7]:
def brand_trt(full_name):
    if "bask" in full_name:
        return full_name[:full_name.index("bask")].strip()
    elif " chaus" in full_name:
        return full_name[:full_name.index(" chaus")].strip()
    else :
        return "None"

def brand_trt2(x):

    if x == "noennamenull" or x =="fashion" or x =="None":
        return "no name"
    if x == "skechers":
        return "sketchers"
    if x == "asics performance":
        return 'asics'
    if x == "hummel ensemble homme - core" or x == 'hummel core':
        return "hummel"
    else :
        return x

In [8]:
df["brand"] = df["description"].apply(brand_trt)
df = df[df["brand"] != ""]
df["new_brand"] = df["brand"].apply(brand_trt2)

# del df["brand"]

In [9]:
def price_trt(price):
    price = price.replace(",", "").replace("DA", "")
    if "-" in price:
        price = price.split("-")
        price = (float(price[0])+ float(price[1]))/2
    return float(price) /1000

In [10]:
df["avg_price"] = df["price"].apply(price_trt)
# del df["price"]

In [11]:
def color_trt(full_name):
    color  = "None"
    list_color = list(filter(lambda x: x in colors, full_name.split()))
    if list_color:
        color = reduce(lambda x, a: a+ " "+ x, list_color)
    return color

In [12]:
df["color"] = df["description"].apply(color_trt)

In [13]:
def uni_trt(color):
    return 1 if len(color.split()) == 1 else 0
        
def multi_trt(color):
    return 1 if len(color.split()) != 1 else 0

In [15]:
df["uni"] = df["color"].apply(uni_trt)
df["multi"] = df["color"].apply(multi_trt)

---
---
# Analyse des donnees:
---
---


In [None]:
# number of shoes in each brand
brands = list(df["new_brand"].value_counts().index)
values = df["new_brand"].value_counts().values
plt.figure(figsize=(9, 9))
plt.barh(brands, values)

In [None]:
# Avrage price for each brand
brands = np.unique(df["new_brand"].values)
avg_prices = []
for b in brands:
    x = df[df["new_brand"]== b]["avg_price"].values.mean()
    avg_prices.append(x)
plt.figure(figsize=(9, 9))
plt.barh(brands, avg_prices)

---
---
# convert categorical data to numerical data:
---
---

In [18]:
new_df = df[ ["gender","reduction"	,"stars","new_brand","avg_price","uni","multi"] ]

In [19]:
new_df = pd.get_dummies(new_df)

In [30]:
new_df

Unnamed: 0,reduction,stars,avg_price,uni,multi,gender_femme,gender_homme,new_brand_adidas,new_brand_asics,new_brand_aveda,...,new_brand_sketchers,new_brand_sport,new_brand_stradivarius,new_brand_superdry,new_brand_tony p,new_brand_under armour,new_brand_us polo,new_brand_vo7,new_brand_yl,new_brand_zeta
0,0.50,0.40,5.10,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.39,0.43,3.60,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.39,0.50,3.60,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.38,0.00,3.20,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.23,0.42,2.30,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,0.28,0.00,14.40,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
556,0.14,0.43,7.70,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
557,0.34,0.50,6.90,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
558,0.27,0.00,3.50,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
---
# buid the model:
---
---

In [31]:
y = new_df["avg_price"]
X = new_df.drop("avg_price", axis = 1)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
multi_reg =  LinearRegression()
multi_reg.fit(X_train, y_train)
multi_pred = multi_reg.predict(X_test)
print(multi_reg.score(X_test, y_test))


-1.454486331030195e+22
