In [1]:
import pandas as pd
import numpy as np
import spacy
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('amazon_ten_topic_data.csv', names=['ad'])
df.dropna(inplace=True)
df = df.sample(frac=1)
df.describe()

Unnamed: 0,ad
count,10000
unique,9934
top,USB Printer with 3 Year Printer Warranty
freq,6


In [3]:
ads = pd.DataFrame(df['ad'].unique(), columns = ['ad'])
ads.describe()

Unnamed: 0,ad
count,9934
unique,9934
top,Justice Wristlet Tiny Mini Mermaid Backpack
freq,1


In [4]:
ads.head()

Unnamed: 0,ad
0,Rip Curl Women's Quartz Sport Watch with Silic...
1,MSI Full HD Non-Glare 1ms 1920 x 1080 144Hz Re...
2,WOMENS DK. SAND BLUE COLOR DENIM STRETCH JEANS...
3,Phoenix Home Kenitra Contemporary Side Chair w...
4,Pangda 10 Pack Tyre Gauge Tyre Tread Depth Gau...


In [5]:
ads.tail()

Unnamed: 0,ad
9929,"K-Swiss Women's Court CHESWICK SDE Trainers, (..."
9930,National Public Seating 1300 Series Steel Fram...
9931,DL1961 Women's Bella Vintage Slim Jeans
9932,mDesign Foaming Glass Soap Dispenser Pump for ...
9933,Christian Van Sant Women's CV0212 Sultry Analo...


In [6]:
nlp = spacy.load('en_core_web_lg')

In [7]:
def stopwords(text):
    """
    Function for removing 
        - stopwords,
        - punctuation,
        - numbers / digits
        - words containing numbers
    """
    doc = nlp(text)
    for token in doc:
        text = [token.text for token in doc if 
                not token.is_stop 
                and not token.is_punct 
                and not token.is_digit]
        
        
    # joining the list of words with space separator
    joined_text = " ".join(text)
    # removing words that contain any sort of numbers, like 'G2420-BK' or 'G1W40A#BGJ '
    re_text = re.sub(r"\S*\d\S*", '', joined_text).strip()
    
    return re_text

In [8]:
ads['SW'] = ads['ad'].apply(stopwords)

In [9]:
ads.head()

Unnamed: 0,ad,SW
0,Rip Curl Women's Quartz Sport Watch with Silic...,Rip Curl Women Quartz Sport Watch Silicone Str...
1,MSI Full HD Non-Glare 1ms 1920 x 1080 144Hz Re...,MSI HD Non Glare x Refresh Rate USB DP HDMI ...
2,WOMENS DK. SAND BLUE COLOR DENIM STRETCH JEANS...,WOMENS DK SAND BLUE COLOR DENIM STRETCH JEANS
3,Phoenix Home Kenitra Contemporary Side Chair w...,Phoenix Home Kenitra Contemporary Chair Maple ...
4,Pangda 10 Pack Tyre Gauge Tyre Tread Depth Gau...,Pangda Pack Tyre Gauge Tyre Tread Depth Gauge ...


In [10]:
print(len(ads['SW'][2]), ' ====', ads['SW'][2])

45  ==== WOMENS DK SAND BLUE COLOR DENIM STRETCH JEANS


In [11]:
ads['SW']=ads['SW'].str.replace("  "," ")
ads['SW'][3]

'Phoenix Home Kenitra Contemporary Chair Maple Wood Legs Snow White Set'

# W2V

## Identifying similar vectors in each document
The best way to expose vector relationships is through the `.similarity()` method of Doc tokens.

In [12]:
# removing multiple occurances
def remove_occ(duplicate):
    """
    Function for removing multiple occurances of the same word vectors
    """
    test_for_occurancies = []
    final_sim = [] 

    for num in duplicate: 
        if num[2] not in test_for_occurancies: 
            test_for_occurancies.append(num[2])
            final_sim.append(num) 
    return final_sim

In [13]:
def word_vectors(text):
    global cc
    tokens = nlp(text)
    
    # list of simular vectors - each word compared with one another to present a similuraity probability
    sims = []
    # counts number of words i.e. tokens in document
    counter = 0
    
    # Iterate through token combinations:
    for token1 in tokens:
        counter += 1
        for token2 in tokens:
            sims.append([token1.text, token2.text, token1.similarity(token2)])
    
    
    # sorting by similarity for better value manipulation later on
    sorted_sims = sorted(sims, key=lambda sim: sim[2], reverse=True)
    
    # removing vectors with  a perfect (1.0) similarity 
    for sim in sorted_sims:
        if sim[2] == 1.0:
            sorted_sims.remove(sim)
            
            
    # word vector with no perfect simularities and duplicates
    word_vector = remove_occ(sorted_sims)
    
    """ 
    taking only first few words out of all of them
    deciding on feaure: len(word_vector)/counter)
    for instance 36 / 9 = 4
    """
    test_for_occurancies = []
    final_words = []

    for i in range(int(len(word_vector)/counter)):
        if word_vector[i][0] not in test_for_occurancies: 
            test_for_occurancies.append(word_vector[i][0])
            final_words.append(word_vector[i][0])
        if word_vector[i][1] not in test_for_occurancies:
            test_for_occurancies.append(word_vector[i][1])
            final_words.append(word_vector[i][1])

    return(final_words)

### removing documents with whitespaces

In [14]:
ads['SW'].replace('', np.nan, inplace=True)
ads.dropna(subset=['SW'], inplace=True)
ads.describe()

Unnamed: 0,ad,SW
count,9932,9932
unique,9932,9597
top,Justice Wristlet Tiny Mini Mermaid Backpack,New Grade LCD Panel Days Warranty
freq,1,10


In [15]:
ads['word_vectors'] = ads['SW'].apply(word_vectors)

In [16]:
ads

Unnamed: 0,ad,SW,word_vectors
0,Rip Curl Women's Quartz Sport Watch with Silic...,Rip Curl Women Quartz Sport Watch Silicone Str...,"[Curl, Women, White, Silicone, Strap, Sport]"
1,MSI Full HD Non-Glare 1ms 1920 x 1080 144Hz Re...,MSI HD Non Glare x Refresh Rate USB DP HDMI Sm...,"[HD, USB, HDMI, Headset, MSI, DP]"
2,WOMENS DK. SAND BLUE COLOR DENIM STRETCH JEANS...,WOMENS DK SAND BLUE COLOR DENIM STRETCH JEANS,"[DK, DENIM, JEANS, BLUE, COLOR]"
3,Phoenix Home Kenitra Contemporary Side Chair w...,Phoenix Home Kenitra Contemporary Chair Maple ...,"[Home, Maple, Wood, Chair, Legs, White]"
4,Pangda 10 Pack Tyre Gauge Tyre Tread Depth Gau...,Pangda Pack Tyre Gauge Tyre Tread Depth Gauge ...,"[Pack, Tyre, Tread]"
...,...,...,...
9929,"K-Swiss Women's Court CHESWICK SDE Trainers, (...",K Swiss Women Court CHESWICK SDE Trainers Blac...,"[Swiss, Black, White, Women]"
9930,National Public Seating 1300 Series Steel Fram...,National Public Seating Series Steel Frame Uph...,"[Public, Seat, Chair, Seating, Upholstered, Na..."
9931,DL1961 Women's Bella Vintage Slim Jeans,Women Bella Vintage Slim Jeans,"[Bella, Slim, Jeans]"
9932,mDesign Foaming Glass Soap Dispenser Pump for ...,mDesign Foaming Glass Soap Dispenser Pump Kitc...,"[Foaming, Kitchen, Bathroom, Vanities, Dispens..."
