# CV Utils - Image Detection example

###### Fashion dataset (Kaggle)  https://www.kaggle.com/datafiniti/womens-shoes-prices

### 0 - Setup

###### Import pckgs

In [4]:
from cv_utils import *
import warnings
warnings.filterwarnings("ignore")

###### Import data

In [33]:
# Reading the csv with price and image url
import csv

with open('data.csv', 'r') as file:
    table = csv.reader(file, delimiter=',')
    
    ## skip headers
    next(table, None)
    
    ## get data (total 10,000 rows)
    #data = [{"id":i, "url":row[10], "price":row[16]} for i,row in enumerate(table)]
    
    ## some items are the same, I'll keep the unique values (653 rows)
    lst_id, data = [], []
    for row in table:
        if row[0] not in lst_id:
            lst_id.append(row[0])
            data.append({"url":row[10], "price":row[16]})
        else:
            next
    
    ## insert a custom id
    for i,dic in enumerate(data):
        dic["id"]=i
        
print("len:", len(data))
data[0]

len: 653


{'url': 'https://i5.walmartimages.com/asr/861ca6cf-fa55-4a48-904d-b764d7c00f0c_1.1a2bb39923e1486d05bdafe37ad832e3.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/efe9ba1e-daed-4534-9e2e-11804bbb30f1_1.62e3e7f0268f641323a245d5caebdd6d.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/0c717815-228e-4c9b-a8fc-d033576461c9_1.f08402e0a5165746e133ddeb589c73e0.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/f46703c8-2cdb-4bf2-a3ea-819f24aab134_1.df725b76ca0112d64bdf566ad97760a9.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,https://i5.walmartimages.com/asr/f652f354-a1fb-47ac-b507-7f97eb216b14_1.39e78b87e2328421803115869ee8b950.jpeg?odnHeight=450&odnWidth=450&odnBg=FFFFFF,http://ak1.ostkcdn.com/images/products/84/146/P16141204.jpg,http://s1.shoes.com/images/br021/womens-naturalizer-danya-soft-silver-crosshatch-shiny-377672_366_tp.jpg,http://s3.shoes.com/images/br021/womens-naturalizer-danya-soft-silver-crossh

In [None]:
# Scraping the imgaes from the web and downloading on file system
import requests
import os

## create the folder if doesn't exist 
dirpath = "imgs/"
if not os.path.exists(dirpath):
    os.makedirs(dirpath)
    
## api get each url and save the image into file
for dic in data:
    try:
        res = requests.get(dic["url"])
        if res.status_code == 200:
            file_name = str(dic["id"])+'.jpg'
            file = open(dirpath+file_name,'wb')
            file.write(res.content)
            file.close()
    except:
        print(dic["id"], "| res:", res.status_code)
        next

383 | res: 200
384 | res: 200


In [None]:
img = utils_load_img(dirpath, "0.jpg", figsize=(7,5))

### 1 - Image Analysis

###### Look for Metadata
- size
- colors

###### Target Variable

###### Size

###### Colors

###### Summary

### 2 - Preprocessing

###### Partitioning

### 3 - Baseline (Bag of Words + Machine Learning)

###### Features Engineering

In [None]:
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [None]:
dic_bow = fit_bow(corpus=dtf_train["text_clean"], vectorizer=vectorizer, top=10, figsize=(10,3))

X_train, vectorizer, dic_vocabulary, lst_text2tokens = dic_bow["X"], dic_bow["vectorizer"], dic_bow["dic_vocabulary"], dic_bow["lst_text2tokens"]

In [None]:
print("from: ", dtf_train["text_clean"].iloc[0], "| len:", len(dtf_train["text_clean"].iloc[0].split()))
print("to: ", lst_text2tokens[0], "| len:", len(lst_text2tokens[0]))
print("check: ", dtf_train["text_clean"].iloc[0].split()[0], " -- idx in vocabolary -->", 
      dic_vocabulary[dtf_train["text_clean"].iloc[0].split()[0]])

###### Features Selection

In [None]:
dic_features_selection = features_selection(X_train, y=dtf_train["y"], vectorizer_fitted=vectorizer, top=None)

In [None]:
dtf_X_train = sparse2dtf(X_train, dic_vocabulary, lst_words=dic_features_selection["ALL"])

In [None]:
dtf_train = pd.concat([dtf_train, dtf_X_train.set_index(dtf_train.index)], axis=1)

print(dtf_train.shape)
dtf_train.head(3)

###### Preprocess Test

In [None]:
X_test = vectorizer.transform(dtf_test["text_clean"])
dtf_X_test = sparse2dtf(X_test, dic_vocabulary, lst_words=dic_features_selection["ALL"])

In [None]:
dtf_test = pd.concat([dtf_test, dtf_X_test.set_index(dtf_test.index)], axis=1)

print(dtf_test.shape)
dtf_test.head(3)

###### Train

In [None]:
X_train = dtf_train.drop(["y","text","text_clean","tags","lang"], axis=1).values
y_train = dtf_train["y"].values
X_test = dtf_test.drop(["y","text","text_clean","tags","lang"], axis=1).values
y_test = dtf_test["y"].values

In [None]:
classifier = naive_bayes.MultinomialNB()

In [None]:
dic_model = ml_text_classif(X_train, y_train, X_test, y_test, preprocessing=False, vectorizer=None, classifier=classifier)
predicted_prob, predicted = dic_model["predicted_prob"], dic_model["predicted"]

###### Evaluate

In [None]:
evaluate_multi_classif(y_test, predicted, predicted_prob, figsize=(15,5))

### 4 - Model Desing & Testing (pre-trained Embeddings + Deep Learning)

###### I already have:
- dtf_train --> x=text_clean, y must be encoded
- NB! I need a new vectorizer cuz the one in Baseline has ngrams=(1,2)

In [None]:
dtf_train, dic_y_mapping = encode_variable(dtf_train, "y")
print(dic_y_mapping)

###### Features Engineering

In [None]:
# create input for lstm (sequences of tokens)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=None, ngram_range=(1,1))

In [None]:
X_train, dic_vocabulary = text2seq(corpus=dtf_train["text_clean"], vectorizer=vectorizer)

In [None]:
print("from: ", dtf_train["text_clean"].iloc[0], "| len:", len(dtf_train["text_clean"].iloc[0].split()))
print("to: ", X_train[0], "| len:", len(X_train[0]))

In [None]:
# create weights for lstm (embeddings of tokens)
nlp = gensim_api.load("glove-wiki-gigaword-300")

In [None]:
embeddings = vocabulary_embeddings(dic_vocabulary, nlp, dim_space=300)

In [None]:
word = dtf_train["text_clean"].iloc[0].split()[0]
print("word:", word)
print("dic[word]:", dic_vocabulary[word], "|idx")
print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape, "|vector")

In [None]:
plot_w2v(nlp, plot_type="2d", word=word, top=20, figsize=(10,5))

###### Preprocess Test

In [None]:
X_test, _ = text2seq(corpus=dtf_test["text_clean"], vectorizer=vectorizer, vocabulary=dic_vocabulary, 
                     maxlen=X_train.shape[1])

###### Train

In [None]:
y_train = dtf_train["y"].values
y_test = dtf_test["y"].values

In [None]:
# this takes a while
dic_lstm = dl_text_classif(dic_y_mapping, embeddings, X_train, y_train, X_test, y_test, 
                           model=None, epochs=10, batch_size=256)

predicted_prob, predicted = dic_lstm["predicted_prob"], dic_lstm["predicted"]

###### Evaluate

In [None]:
evaluate_multi_classif(y_test, predicted, predicted_prob, figsize=(15,5))

### 5 - Model Desing & Testing (training Embeddings from scratch + Deep Learning)

###### Plan:
- I will just create a new nlp model using all data
- create new Embeddings array with the new nlp model and the same vocabulary
- the rest of the process is the same as before

###### Train Word2Vec

In [None]:
np.max([len(text.split()) for text in dtf["text_clean"]]) /2 

In [None]:
lst_bigrams_stopwords = ["of","with","without","and","or","the","a"]

In [None]:
lst_corpus, nlp = fit_w2v(corpus=dtf["text_clean"], ngrams=1, min_count=1, size=300, window=18, sg=0, epochs=30, 
                          lst_bigrams_stopwords=lst_bigrams_stopwords)

In [None]:
plot_w2v(nlp, plot_type="2d", word=word, top=20, figsize=(10,5))

In [None]:
embeddings = vocabulary_embeddings(dic_vocabulary, nlp, dim_space=300)

###### Train

In [None]:
# this takes a while
dic_lstm = dl_text_classif(dic_y_mapping, embeddings, X_train, y_train, X_test, y_test, 
                           model=None, epochs=10, batch_size=256)

predicted_prob, predicted = dic_lstm["predicted_prob"], dic_lstm["predicted"]

###### Evaluate

In [None]:
evaluate_multi_classif(y_test, predicted, predicted_prob, figsize=(15,5))

### 6 - Model Desing & Testing (Embedding + Clustering)

###### Plan:
- I will use the nlp model to create clusters for the classes
- then convert news into vectors and calculate distances

######  Create Clusters

In [None]:
# from the Baseline step
dic_clusters = {}
for y,lst_keywords in dic_features_selection.items():
    if y != "ALL":
        lst_grams = []
        for gram in lst_keywords:
            if len(gram.split())>1:
                lst_grams.append("_".join(gram.split()))
            else:
                lst_grams.append(gram)
        dic_clusters.update({y:lst_grams})

In [None]:
for k,v in dic_clusters.items():
    print(k, ": ", v[0:5], "...")

###### Fit PCA

In [None]:
pca = fit_pca_w2v(corpus=dtf_train["text_clean"], nlp=nlp)

###### Predict Clusters

In [None]:
predicted_prob, predicted = predict_clusters_w2v(corpus=dtf_test["text_clean"], dic_clusters=dic_clusters, 
                                                 nlp=nlp, pca=None)

print("Accuracy (overall correct predictions):",  round(metrics.accuracy_score(y_test, predicted),3))
print("Detail:")
print(metrics.classification_report(y_test, predicted))

###### Evaluate

In [None]:
evaluate_multi_classif(y_test, predicted, predicted_prob, figsize=(15,5))