# Part 1 Pandas Review

In [1]:
import pandas as pd
import numpy as np

In [3]:
#Creating DataFrame from list
x = [[11,12,13],[14,15,16]]
df = pd.DataFrame(x,index=['1st','2nd'], columns=['First','Second','Third'])
print(df)

     First  Second  Third
1st     11      12     13
2nd     14      15     16


In [4]:
type(df.index)

pandas.core.indexes.base.Index

In [5]:
#Column Selection, Addition and Deletion
print(df['First']) #Selection
print("")
df['Fourth'] = pd.Series([45,46],index=df.index) #Addition
print(df)
print("")
df['Sum of 1st & 2nd'] = df['First']+df['Second'] #Addition relating exsiting columns
print(df)
print("")
del df['Second'] #Deletion
print(df)

1st    11
2nd    14
Name: First, dtype: int64

     First  Second  Third  Fourth
1st     11      12     13      45
2nd     14      15     16      46

     First  Second  Third  Fourth  Sum of 1st & 2nd
1st     11      12     13      45                23
2nd     14      15     16      46                29

     First  Third  Fourth  Sum of 1st & 2nd
1st     11     13      45                23
2nd     14     16      46                29


In [6]:
#Row Selection, Addition and Deletion
x = [[11,12,13],[14,15,16]]
df = pd.DataFrame(x,index=['1st','2nd'], columns=['First','Second','Third'])
print(df.loc['1st']) #Selection
print("")
print(df.iloc[0])#Selection with index number
print("")
df2 = pd.DataFrame([[100,101,102],[201,202,203]],index = ['1st','4th'],columns=['First','Second','Third'])
df = df.append(df2)
print(df)
print("")
#df = df.drop('3rd')
print(df)

First     11
Second    12
Third     13
Name: 1st, dtype: int64

First     11
Second    12
Third     13
Name: 1st, dtype: int64

     First  Second  Third
1st     11      12     13
2nd     14      15     16
1st    100     101    102
4th    201     202    203

     First  Second  Third
1st     11      12     13
2nd     14      15     16
1st    100     101    102
4th    201     202    203


In [7]:
df = df.reset_index()
print(df.head())
del df['index']
print(df.head())

  index  First  Second  Third
0   1st     11      12     13
1   2nd     14      15     16
2   1st    100     101    102
3   4th    201     202    203
   First  Second  Third
0     11      12     13
1     14      15     16
2    100     101    102
3    201     202    203


In [8]:
#Replacing data
data = [[140,90,'not accepted'],[190,88,'accepted'],[174,81,'accepted'],[183,130,'not accepted'],[168,72,'accepted']]
df = pd.DataFrame(data ,columns=['Height','Weight','Result'])
print(df)
print("")
df['Result'].replace(['not accepted','accepted'],[0,1],inplace=True)
# use apply for more complex mapping with either lambda or defined functions i.e. weights<90 vs weights>90
print(df)

   Height  Weight        Result
0     140      90  not accepted
1     190      88      accepted
2     174      81      accepted
3     183     130  not accepted
4     168      72      accepted

   Height  Weight  Result
0     140      90       0
1     190      88       1
2     174      81       1
3     183     130       0
4     168      72       1


In [9]:
height_tf = lambda x: 'tall' if x>180 else 'short'
def height_tf(x):
    if x>180:
        return 'tall'
    else:
        return 'short'

In [10]:
df['Height'] = df['Height'].apply(height_tf)
df.head()

Unnamed: 0,Height,Weight,Result
0,short,90,0
1,tall,88,1
2,short,81,1
3,tall,130,0
4,short,72,1


In [11]:
#Converting from DataFrame to NumPy array
df_array = df.values
print(df_array)

[['short' 90 0]
 ['tall' 88 1]
 ['short' 81 1]
 ['tall' 130 0]
 ['short' 72 1]]


# part 2: Scrapping





In [12]:
import requests
from bs4 import BeautifulSoup
import spacy
import nltk

In [13]:
res = requests.get('http://quotes.toscrape.com/')

In [14]:
soup = BeautifulSoup(res.text,'lxml')

In [16]:
quotes = soup.find_all('div',{'class':'quote'}) # quotes here is a list of Tags (work with them as soups)

In [17]:
q = []
for quote in quotes:
    quote_txt = quote.find('span',{'class':'text'})
    author = quote.find('small',{'class':'author'})
    tags = quote.find_all('a',{'class':'tag'})
    tag = [t.text for t in tags]
    q.append({'quote':quote_txt.text, 'author':author.text, 'tags':tag})

In [18]:
dataset = pd.DataFrame(data = q)
dataset.head()

Unnamed: 0,author,quote,tags
0,Albert Einstein,“The world as we have created it is a process ...,"[change, deep-thoughts, thinking, world]"
1,J.K. Rowling,"“It is our choices, Harry, that show what we t...","[abilities, choices]"
2,Albert Einstein,“There are only two ways to live your life. On...,"[inspirational, life, live, miracle, miracles]"
3,Jane Austen,"“The person, be it gentleman or lady, who has ...","[aliteracy, books, classic, humor]"
4,Marilyn Monroe,"“Imperfection is beauty, madness is genius and...","[be-yourself, inspirational]"


In [21]:
nlp = spacy.load('en')

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [20]:
score = np.zeros(len(dataset['quote']), dtype = np.float32)
for i, q in enumerate(dataset['quote']):
    for q2 in dataset['quote']:
        score[i] += nlp(q).similarity(nlp(q2))
    score[i] -= 1 # because q.similarity(q) = 1 (doesn't make a difference)

NameError: name 'nlp' is not defined

In [48]:
# unravel_index here has no use but it is important when score is multi-dimensional
most_similar = np.unravel_index(np.argmax(score,axis = None),score.shape)
print(most_similar)

(0,)


# Extra Spacy

In [86]:
# some of spacy's token attributes
fw = 12
sen = nlp(dataset['quote'].iloc[0])
print(f"{'word':{fw}} {'root':{fw}} {'POS':{fw-5}} {'Tags':{fw-6}} {'Shape':{fw-6}} {'Is Stop':{fw-8}}")
for t in sen:
        print(f"{t.text:{fw}} {t.lemma_:{fw}} {t.pos_:{fw-5}} {t.tag_:{fw-6}} {t.shape_:{fw-6}} {t.is_stop:<{fw-8}}")
        
# full documentation here
# https://spacy.io/api/token

word         root         POS     Tags   Shape  Is Stop
“            "            PUNCT   ``     “      0   
The          the          DET     DT     Xxx    0   
world        world        NOUN    NN     xxxx   0   
as           as           ADP     IN     xx     1   
we           -PRON-       PRON    PRP    xx     1   
have         have         VERB    VBP    xxxx   1   
created      create       VERB    VBN    xxxx   0   
it           -PRON-       PRON    PRP    xx     1   
is           be           VERB    VBZ    xx     1   
a            a            DET     DT     x      1   
process      process      NOUN    NN     xxxx   0   
of           of           ADP     IN     xx     1   
our          -PRON-       ADJ     PRP$   xxx    1   
thinking     thinking     NOUN    NN     xxxx   0   
.            .            PUNCT   .      .      0   
It           -PRON-       PRON    PRP    Xx     0   
can          can          VERB    MD     xxx    1   
not          not          ADV     RB     xx

In [71]:
spacy.explain('NN')

'noun, singular or mass'

In [72]:
from spacy import displacy
displacy.serve(sen, style='dep') # ent


    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [07/Feb/2019 00:47:43] "GET / HTTP/1.1" 200 16094
127.0.0.1 - - [07/Feb/2019 00:47:44] "GET /favicon.ico HTTP/1.1" 200 16094



    Shutting down server on port 5000.

