### Imports

In [1]:
# Import necessary packages
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import bs4

import pandas as pd
import numpy as np

import gensim


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import linear_model

import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
import re
from nltk.corpus import stopwords
import wikipedia


## My Approach


I thought of following the below steps:

    1. Scrape the webpages
        1. Get the Dinosaur names and their links
        2. Open each links to get the content of each dinosaur names
        
    2. Represent all the above scraping part into Dataframe
    
    3. Do Preprocessing on the contents that we have got ( remove stopwords, punctuations etc.)
    
    4. Use Word embeddings (Word2Vec model) - using CBOW approach - because it will give the the context of words which in respect to tagret words.
    
    5. Get the TFIDF matrix ( using this because - the top ranked terms will be used in combination with word embeddings of each of this words, and create seperate features of all these values)
         1.Find the top ranked terms from TFIDF matrix
    
    6. Start Forecast 
        1. As I have to find the number of occurences of Dinosaur words. I will consider the target variable as count of dinosaur words that are already present in the content and then compare our predictions on it.
        2. In order to get the predictions, I generated different features that are based on the Word embeddings that I created, and the TfIdf matrix top - ranked terms.
        3. Use these features to Train-Test data and predict the outcome
        
        

## Scrapping the Wikipedia pages

#### Functions to save the files for checking purpose

In [2]:
# To open the link
def open_link(baseurl,newurl):
    rawtext = urllib.request.urlopen(baseurl+newurl)
    soup = bs4.BeautifulSoup(rawtext,'lxml')
    return soup

# To open the file in 'w' mode
def create_raw_file(file):
    
    # open file
    with open(file, 'w', encoding='utf-8') as raw_category_file:
        raw_category_file.close()
    
    
# To open the file in 'a' mode
def create_file(file,dino_list):
    #category file include news-categories
    with open(file,'a', encoding='utf-8') as file:
        file.write(dino_list + '\n')
    file.close()

#### Use this base url

In [3]:
baseurl='https://en.wikipedia.org'

#### Scrapping more inside the links of each dinosaur pages to get context

In [4]:
index_page=open_link(baseurl,'/wiki/List_of_dinosaur_genera')

dinosours_data=index_page.find_all('i')

#print(dinosours_data)
dinosour_titles_urllink = []

dinosour_final_list = []

df = pd.DataFrame()

file1 = 'dino_list.txt'
file2 = 'dino_url.txt'

create_raw_file(file1)
create_raw_file(file2)

for data in dinosours_data:
    dinosour_titles = []
    for subdata in data.findAll('a',href=True):
        title = subdata['title']
        dinosour_titles.append(title)
        url_link = baseurl + subdata['href']
        dinosour_titles.append(url_link)
        dinosour_final_list.append(dinosour_titles)
        create_file(file1,title)
        create_file(file2,url_link)
print(dinosour_final_list)

[['Tyrannosaurus', 'https://en.wikipedia.org/wiki/Tyrannosaurus'], ['Apatosaurus', 'https://en.wikipedia.org/wiki/Apatosaurus'], ['Nomen dubium', 'https://en.wikipedia.org/wiki/Nomen_dubium'], ['Nomen nudum', 'https://en.wikipedia.org/wiki/Nomen_nudum'], ['Nomina dubia', 'https://en.wikipedia.org/wiki/Nomina_dubia'], ['Tyrannosaurus', 'https://en.wikipedia.org/wiki/Tyrannosaurus'], ['Nomen nudum', 'https://en.wikipedia.org/wiki/Nomen_nudum'], ['Nomen oblitum', 'https://en.wikipedia.org/wiki/Nomen_oblitum'], ['Nomen dubium', 'https://en.wikipedia.org/wiki/Nomen_dubium'], ['Hadrosaurus', 'https://en.wikipedia.org/wiki/Hadrosaurus'], ['Allosaurus', 'https://en.wikipedia.org/wiki/Allosaurus'], ['Aachenosaurus', 'https://en.wikipedia.org/wiki/Aachenosaurus'], ['Aardonyx', 'https://en.wikipedia.org/wiki/Aardonyx'], ['Abdallahsaurus', 'https://en.wikipedia.org/wiki/Abdallahsaurus'], ['Giraffatitan', 'https://en.wikipedia.org/wiki/Giraffatitan'], ['Abelisaurus', 'https://en.wikipedia.org/wiki/

#### Finding the unique values of dinosaur names


In [5]:
## Finding the unique values of dinosaur names

# set will give us unordered dino names... and so it will give random dino names to me...
unique_dino_names = set()
for d in dinosour_final_list:
    unique_dino_names.add(d[0])

#print(unique_dino_names)
print(len(unique_dino_names))

1511


#### I am considering first 50 pages due to computer limitations i.e. to open first 50 dino names from the set() that I created above

In [6]:
num_pages=50

#### Now scrape to get the contents of each dino link page

In [7]:
allpgcontnt = []
links = []
i =0
unique_dino_names_list = list(unique_dino_names)
df = pd.DataFrame()

file3 = 'content.txt'

for eachnamelink in unique_dino_names_list[0:num_pages]:
    links.append(eachnamelink)
    i=i+1
    print('count :',i)
    try:
        
        pages = wikipedia.page(eachnamelink)
        print('#####dinosour title ######',eachnamelink)
        create_file(file3,pages.content)
        allpgcontnt.append(pages.content)
        
    except (wikipedia.exceptions.PageError,wikipedia.exceptions.DisambiguationError) as e:
        print('Something went wrong')

        #create_file('content.txt',pages.content)


count : 1
#####dinosour title ###### Jiutaisaurus
count : 2
#####dinosour title ###### Acrocanthosaurus
count : 3
#####dinosour title ###### Gannansaurus
count : 4
#####dinosour title ###### Pachysaurops
count : 5
#####dinosour title ###### Xiongguanlong
count : 6
#####dinosour title ###### Europasaurus
count : 7
#####dinosour title ###### Zhanghenglong
count : 8
#####dinosour title ###### Macrodontophion
count : 9
#####dinosour title ###### Propanoplosaurus
count : 10
#####dinosour title ###### Europelta
count : 11
#####dinosour title ###### Ambopteryx
count : 12
#####dinosour title ###### Oceanotitan
count : 13
#####dinosour title ###### Agujaceratops
count : 14
#####dinosour title ###### Liubangosaurus
count : 15
#####dinosour title ###### Luoyanggia
count : 16
#####dinosour title ###### Yixianosaurus
count : 17
#####dinosour title ###### Atlantosaurus
count : 18
#####dinosour title ###### Wannanosaurus
count : 19
#####dinosour title ###### Kinnareemimus
count : 20
#####dinosour tit

### Representing everything in Dataframe

In [8]:
df['Dinosour_Name'] = links
df['Content'] = allpgcontnt

df.head()

Unnamed: 0,Dinosour_Name,Content
0,Jiutaisaurus,Jiutaisaurus is a genus of sauropod dinosaur f...
1,Acrocanthosaurus,Acrocanthosaurus ( ak-ro-KAN-thə-SAWR-əs; mean...
2,Gannansaurus,Gannansaurus is an extinct genus of somphospon...
3,Pachysaurops,"Plateosaurus (probably meaning ""broad lizard"",..."
4,Xiongguanlong,"Xiongguanlong (""Grand Pass dragon"") is a genus..."


##### Word2Vec model requires list of lists to give as input to the model. Therefore, the below code cell creates list of list of words from the  declared function content_to_wordlist()


The function content_to_wordlist() does necessary preprocessing by removing the stop words or non-letters etc and creates tokens of words from the content.

In [9]:
cleaned_sentence = []

def cleaned_data(x):
    return " ".join(x)

def content_to_wordlist( content, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # removing stop words.  Returns a list of words.
    # 1. Remove non-letters
    content_text = re.sub("[^a-zA-Z]"," ", content)
    
    content_text = content_text.replace("  ", " ")
    cleaned_sentence.append(content_text.lower())
    
    words = content_text.lower().split()
    #
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    return(words)

#### Considering a new dataframe to do further operations

In [10]:
dataframe = df.copy()

In [11]:

dataframe["cleaned"]=dataframe["Content"].map(content_to_wordlist)
dataframe["cleaned"]=dataframe["cleaned"].map(cleaned_data)
dataframe.head()

Unnamed: 0,Dinosour_Name,Content,cleaned
0,Jiutaisaurus,Jiutaisaurus is a genus of sauropod dinosaur f...,jiutaisaurus is a genus of sauropod dinosaur f...
1,Acrocanthosaurus,Acrocanthosaurus ( ak-ro-KAN-thə-SAWR-əs; mean...,acrocanthosaurus ak ro kan th sawr s meaning h...
2,Gannansaurus,Gannansaurus is an extinct genus of somphospon...,gannansaurus is an extinct genus of somphospon...
3,Pachysaurops,"Plateosaurus (probably meaning ""broad lizard"",...",plateosaurus probably meaning broad lizard oft...
4,Xiongguanlong,"Xiongguanlong (""Grand Pass dragon"") is a genus...",xiongguanlong grand pass dragon is a genus of ...


In [12]:
## Preparing the input for Word2Vec model

all_content = dataframe['cleaned']

one_single_content =""
for each_content in all_content:
    one_single_content += ' '.join(all_content)
print(one_single_content[:100])

jiutaisaurus is a genus of sauropod dinosaur from the quantou formation of china jiutaisaurus was a 


##### Creates a list 'sentences' to give the input to Word2Vec model.

In [13]:
raw_sentences = tokenizer.tokenize(one_single_content)

sentences = []
remove_stopwords = True
for raw_sentence in raw_sentences:
    # If a sentence is empty, skip it
    if len(raw_sentence) > 0:
        # Otherwise, call review_to_wordlist to get a list of words
        sentences.append( content_to_wordlist( raw_sentence, \
          remove_stopwords ))
print(len(sentences))

1


In [14]:
print(sentences[:1])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## Word2Vec

CBOW is learning to predict the word by the context. A context may be single word or multiple word for a given target words.
for example “The cat jumped over the puddle.”

So one approach is to treat {“The”, “cat”, ’over”, “the’, “puddle”} as a context and from these words, be able to predict or generate the center word “jumped”. This type of model we call a Continuous Bag of Words (CBOW) Model.


Here as I want to forecast occurence of a single word dinosaur, however I dont want to use the count of word dinosaur for forecasting. So I thought of using wrod embeddings (CBOW) because, it will help me to predict the word by the context by giving word vectors.

In [15]:
# Set values for various parameters
num_features = 50    # Word vector dimensionality                      
min_count = 1   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context_window_size = 2          # Context window size      
sg = 0 # This states that I am using CBOW 

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print ("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_count, \
            window = context_window_size,sg=0)

model.most_similar("dinosaur")

Training model...


  del sys.path[0]


[('representing', 0.4905478358268738),
 ('la', 0.45430469512939453),
 ('abler', 0.42828869819641113),
 ('midsized', 0.4275635778903961),
 ('soft', 0.42627406120300293),
 ('jaws', 0.42160624265670776),
 ('transforms', 0.4066582918167114),
 ('lindsay', 0.4027180075645447),
 ('absence', 0.40259355306625366),
 ('knollenmergel', 0.4003618657588959)]

## TF-IDF Generation

In [16]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english",min_df = 3)
X_tfidf = tfidf_vectorizer.fit_transform(all_content)
print(X_tfidf.shape)

(50, 1560)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [17]:
terms= tfidf_vectorizer.get_feature_names()

### Rank Terms based on TFIDF

##### The below two cells are just the trial that I was doing to check top ranked words in all documents. I am not using this in my next code part. It's just I wanted to check it.

In [18]:
import operator
def rank_terms( X_tfidf, terms ):
    # get the sums over each column
    sums = X_tfidf.sum(axis=0)
    #print(sums)
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    #print(weights)
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

To find out top 20 tfidf terms from all the documents

In [19]:
ranking = rank_terms( X_tfidf, terms )
#print(ranking)

for i, pair in enumerate( ranking[0:20]):
    print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

01. species (3.03)
02. genus (2.93)
03. dinosaur (2.88)
04. formation (2.70)
05. known (2.63)
06. cretaceous (2.51)
07. type (2.36)
08. skull (2.26)
09. specimen (2.23)
10. bones (2.18)
11. vertebrae (2.07)
12. bone (2.02)
13. long (1.97)
14. sauropod (1.95)
15. china (1.85)
16. described (1.81)
17. holotype (1.74)
18. late (1.74)
19. references (1.73)
20. partial (1.67)


#### Rank top terms from each documents

But the below function calculates the top terms from each documents, which will be used to generate features for my prediction model. 

In [20]:
def get_top_ranked_topics( terms, tfidf_array, term_index,top ):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( tfidf_array[term_index,:] )[::-1]
    # now get the terms corresponding to the top-ranked indices
    top_terms = []
    for i in top_indices[0:top]:
        top_terms.append( terms[i] )
    return top_terms



In [21]:
X_tfidf.shape

(50, 1560)

In [22]:
num_of_top_terms =10

##### this below cell gives the top 10 terms of each document

In [23]:
tfidf_rank =[]
for i in range(num_pages):
    tfidf_rank.append( get_top_ranked_topics( terms, X_tfidf.toarray(), i, num_of_top_terms ) )
    str_descriptor = ", ".join( tfidf_rank[i] )
    print("Term %02d: %s" % ( i+1, str_descriptor ) )

Term 01: li, sauropod, china, entry, mailing, dinosaur, geology, eighteen, list, cretaceous
Term 02: theropod, prey, bones, footprints, rose, texas, large, theropods, spines, formation
Term 03: vertebra, euhelopus, mid, shares, southern, characters, late, known, basin, indicating
Term 04: material, animals, von, limb, species, skeletons, germany, posture, growth, individuals
Term 05: china, tyrannosauroids, tyrannosaur, tyrannosauroid, city, skull, right, cretaceous, fossil, rex
Term 06: camarasaurus, bones, taxa, growth, fenestra, brachiosaurids, brachiosaurus, sauropod, vertebrae, process
Term 07: hadrosauroid, plesiomorphic, henan, hadrosaurid, right, features, hadrosauroids, xing, non, characters
Term 08: given, mailing, entry, dubious, list, nomen, dubium, tooth, genus, based
Term 09: nodosaurid, natural, osteoderms, specimen, cast, impressions, right, centimetres, authors, nodosaurids
Term 10: ankylosaur, nodosaurid, osteoderms, europe, partial, isolated, view, pubis, dorsal, tee

In [24]:
def tfidf_rank_method(x):
    return tfidf_rank[x.name]

In [25]:
len(tfidf_rank)

50

In [26]:
dataframe.shape

(50, 3)

##### This dataframe column tfidf_rank shows the list of each top ranked words which will be used with combination of Word2Vec model

- So the idea behind this is that for each word for eg. vertebra,animals ( from below df) a wordvector will be calculated i.e Word2Vec.wv['vertebra'].

- First -  my top ranked list has 10 terms and 50 documents so it will create 50 x 10 matrix ( I am showing that in df3 dataframe)

- Second - now as I have got seperate terms in each column , I will find the word vectors of each word i.e the Word2Vec.wv[<each word>] 

- Third - so it will create a new matrix 10 (documents) x 500 (terms) ( I am showing that in cols_df)

- Create a final_df by concatenating the cols_df and temp_df ( it contain only dinosaur name and target feature)

In [27]:
dataframe["tfidf_rank"]=dataframe.apply(tfidf_rank_method, axis=1)
dataframe.head()

Unnamed: 0,Dinosour_Name,Content,cleaned,tfidf_rank
0,Jiutaisaurus,Jiutaisaurus is a genus of sauropod dinosaur f...,jiutaisaurus is a genus of sauropod dinosaur f...,"[li, sauropod, china, entry, mailing, dinosaur..."
1,Acrocanthosaurus,Acrocanthosaurus ( ak-ro-KAN-thə-SAWR-əs; mean...,acrocanthosaurus ak ro kan th sawr s meaning h...,"[theropod, prey, bones, footprints, rose, texa..."
2,Gannansaurus,Gannansaurus is an extinct genus of somphospon...,gannansaurus is an extinct genus of somphospon...,"[vertebra, euhelopus, mid, shares, southern, c..."
3,Pachysaurops,"Plateosaurus (probably meaning ""broad lizard"",...",plateosaurus probably meaning broad lizard oft...,"[material, animals, von, limb, species, skelet..."
4,Xiongguanlong,"Xiongguanlong (""Grand Pass dragon"") is a genus...",xiongguanlong grand pass dragon is a genus of ...,"[china, tyrannosauroids, tyrannosaur, tyrannos..."


### Forecast starts

In [28]:
def calculate_dinosaur_count(x):
    return x.count("dinosaur")

In [29]:
dataframe["target"]=dataframe["cleaned"].map(calculate_dinosaur_count)
dataframe.columns = [ 'Dinosour_Name', 'Content', 'cleaned', "tfidf_rank","target"]
dataframe.head()

Unnamed: 0,Dinosour_Name,Content,cleaned,tfidf_rank,target
0,Jiutaisaurus,Jiutaisaurus is a genus of sauropod dinosaur f...,jiutaisaurus is a genus of sauropod dinosaur f...,"[li, sauropod, china, entry, mailing, dinosaur...",3
1,Acrocanthosaurus,Acrocanthosaurus ( ak-ro-KAN-thə-SAWR-əs; mean...,acrocanthosaurus ak ro kan th sawr s meaning h...,"[theropod, prey, bones, footprints, rose, texa...",9
2,Gannansaurus,Gannansaurus is an extinct genus of somphospon...,gannansaurus is an extinct genus of somphospon...,"[vertebra, euhelopus, mid, shares, southern, c...",1
3,Pachysaurops,"Plateosaurus (probably meaning ""broad lizard"",...",plateosaurus probably meaning broad lizard oft...,"[material, animals, von, limb, species, skelet...",21
4,Xiongguanlong,"Xiongguanlong (""Grand Pass dragon"") is a genus...",xiongguanlong grand pass dragon is a genus of ...,"[china, tyrannosauroids, tyrannosaur, tyrannos...",1


In [30]:
df_short = dataframe.copy()
df_short = df_short.drop(columns=["Content","cleaned"])
df_short.head()

Unnamed: 0,Dinosour_Name,tfidf_rank,target
0,Jiutaisaurus,"[li, sauropod, china, entry, mailing, dinosaur...",3
1,Acrocanthosaurus,"[theropod, prey, bones, footprints, rose, texa...",9
2,Gannansaurus,"[vertebra, euhelopus, mid, shares, southern, c...",1
3,Pachysaurops,"[material, animals, von, limb, species, skelet...",21
4,Xiongguanlong,"[china, tyrannosauroids, tyrannosaur, tyrannos...",1


In [31]:
ip_cols =[]
for i in range(1, num_of_top_terms+1):
    col = "t" + str(i)
    ip_cols.append(col)

In [44]:
df3 = pd.DataFrame(df_short['tfidf_rank'].values.tolist(), columns=ip_cols)
df3.shape

(50, 10)

In [33]:
df_long = df_short.merge(df3, left_index=True, right_index=True)
df_long.head()

Unnamed: 0,Dinosour_Name,tfidf_rank,target,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10
0,Jiutaisaurus,"[li, sauropod, china, entry, mailing, dinosaur...",3,li,sauropod,china,entry,mailing,dinosaur,geology,eighteen,list,cretaceous
1,Acrocanthosaurus,"[theropod, prey, bones, footprints, rose, texa...",9,theropod,prey,bones,footprints,rose,texas,large,theropods,spines,formation
2,Gannansaurus,"[vertebra, euhelopus, mid, shares, southern, c...",1,vertebra,euhelopus,mid,shares,southern,characters,late,known,basin,indicating
3,Pachysaurops,"[material, animals, von, limb, species, skelet...",21,material,animals,von,limb,species,skeletons,germany,posture,growth,individuals
4,Xiongguanlong,"[china, tyrannosauroids, tyrannosaur, tyrannos...",1,china,tyrannosauroids,tyrannosaur,tyrannosauroid,city,skull,right,cretaceous,fossil,rex


#### Below two cells calculate word vector for each word

In [34]:
def get_word_vec(x):
    return model.wv[x]

In [35]:
ip_cols =[]
for i in range(1, num_of_top_terms+1):
    col = "t" + str(i)
    ip_cols.append(col)
    df_long[col] = df_long[col].map(get_word_vec)
df_long.head()

Unnamed: 0,Dinosour_Name,tfidf_rank,target,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10
0,Jiutaisaurus,"[li, sauropod, china, entry, mailing, dinosaur...",3,"[-0.008703152, -0.0001660559, 0.0020076404, -0...","[-0.0014460513, -0.007113537, 0.011040891, 0.0...","[0.0020031822, 0.0067751287, -0.0010514023, -0...","[-0.006622589, -0.0013424736, 0.00828771, -0.0...","[-0.0034199744, 0.0010439792, 0.004772311, 0.0...","[-0.0054585026, 0.0053561227, 0.010639078, 0.0...","[0.0015393439, 0.009300169, -0.009908707, -0.0...","[-0.003369663, -0.0012790837, -0.0010941069, -...","[0.0010499741, 0.0037522588, 0.0043581454, 0.0...","[0.008062972, -0.009854394, -0.006828905, -0.0..."
1,Acrocanthosaurus,"[theropod, prey, bones, footprints, rose, texa...",9,"[0.0033310354, 0.008749673, 0.0062799826, 0.00...","[-0.005493202, -0.010383192, 0.004714991, 0.00...","[-9.6452764e-05, 0.0017368955, -0.008461526, -...","[0.0042897193, 0.0065922267, 0.010245321, -0.0...","[-0.00037471237, 0.0024529952, 0.0014172881, 0...","[0.00061023154, 0.00268959, -0.005486665, 0.00...","[0.005841105, 0.008258192, -0.0012092775, -0.0...","[0.0038866969, 0.0072074174, -0.0045671887, 0....","[0.008793088, -0.0017526482, 0.006015234, -0.0...","[0.0061031375, 0.0035557991, 0.00874045, 0.005..."
2,Gannansaurus,"[vertebra, euhelopus, mid, shares, southern, c...",1,"[-0.00873571, 0.009275186, -0.0037997076, -0.0...","[0.0062745106, -0.00594248, 0.008685108, 0.001...","[-0.0005643376, 0.0067243045, -0.0060712346, 0...","[-0.004861653, 0.0008823294, 0.0017011497, -0....","[0.009852141, -0.004580853, 0.0044415337, 0.00...","[-0.0005642553, 0.0070317206, 0.0037736846, 0....","[-0.0007822251, -0.0015254677, 0.008783886, 0....","[-0.0006488418, -0.007947108, 0.00975566, -0.0...","[-0.0037810877, 0.0048288195, -0.008331582, -0...","[0.0091765905, -0.0040588705, -0.0078060175, -..."
3,Pachysaurops,"[material, animals, von, limb, species, skelet...",21,"[0.0071044504, -0.009191907, 0.000433386, 0.00...","[-0.0002303709, -0.0048747147, 0.00562177, -0....","[0.008674698, -0.0034388176, 0.001694855, 0.00...","[-0.003685667, 0.007436367, 0.0074198795, -0.0...","[0.011263338, -0.0034976806, 0.00031262735, -0...","[-0.005053603, 0.0007246767, -0.0015069584, 0....","[-0.00866919, -0.005430887, 7.4599106e-05, -0....","[-0.0075323666, 0.001843284, -0.0048758085, -0...","[-0.009223508, 0.0023607998, 0.0036196194, 0.0...","[-0.008864173, -0.007065329, -0.0076381424, 0...."
4,Xiongguanlong,"[china, tyrannosauroids, tyrannosaur, tyrannos...",1,"[0.0020031822, 0.0067751287, -0.0010514023, -0...","[-0.000801846, -0.0008947225, -0.0057692425, 0...","[0.0029446832, -0.0045438446, -0.0009876201, 0...","[0.00031460484, 0.0016246911, 0.009568301, 0.0...","[0.0022986007, 0.00966127, 0.007996967, 0.0032...","[-0.0071080658, 0.0044984845, -0.0006328861, -...","[0.0018523005, -0.0037206654, -0.0067740777, 0...","[0.008062972, -0.009854394, -0.006828905, -0.0...","[-0.0016524506, -0.0051935217, 0.0043878173, -...","[-0.009546558, -0.008270859, -0.008349231, 0.0..."


#### Now splitting/mapping each list of vector values to seperate columns

In [36]:
cols_df = pd.DataFrame()
for i in ip_cols:
    colnames =[]
    for j in range(1,num_features+1):
        col = i + "_" + str(j)
        colnames.append(col)
    temp_df = pd.DataFrame(df_long[i].values.tolist(),columns=colnames)
    cols_df = pd.concat([cols_df, temp_df], axis=1)
cols_df.head()

Unnamed: 0,t1_1,t1_2,t1_3,t1_4,t1_5,t1_6,t1_7,t1_8,t1_9,t1_10,...,t10_41,t10_42,t10_43,t10_44,t10_45,t10_46,t10_47,t10_48,t10_49,t10_50
0,-0.008703,-0.000166,0.002008,-0.006755,0.003963,-0.002143,0.003032,0.009123,-0.001173,-0.003588,...,-0.004983,0.001222,0.004389,-0.004061,0.006285,0.005099,-0.009933,0.006327,-0.008698,-0.00176
1,0.003331,0.00875,0.00628,0.004833,-0.003235,-0.003018,0.007442,0.008552,-0.009889,-0.006838,...,-0.003832,0.001065,-0.005445,-0.003076,-0.00717,-0.000224,-0.002739,-0.002719,0.005719,0.007731
2,-0.008736,0.009275,-0.0038,-0.00939,-0.005893,0.003903,0.003581,-0.000585,-0.005245,-0.010037,...,0.001198,-0.001291,0.000281,0.000109,-0.007237,-0.008872,-0.008238,0.001097,0.000521,0.009116
3,0.007104,-0.009192,0.000433,0.006982,-0.001122,-0.009042,0.007791,-0.00637,0.002291,-0.005849,...,-0.000153,0.007777,-0.005909,0.003201,0.005702,0.001017,0.001614,0.000928,0.006446,0.000143
4,0.002003,0.006775,-0.001051,-0.006629,-0.003277,0.002399,0.005618,0.007632,0.001566,0.000476,...,0.001671,0.009109,-0.007238,-0.008904,-0.000514,0.007569,-0.000668,0.008565,0.00251,0.002103


In [37]:
temp_df = df_long[["Dinosour_Name", "target"]]
final_df = pd.concat([temp_df, cols_df], axis=1)

In [38]:
final_df.set_index("Dinosour_Name", inplace=True)

### Do Predictions

In [39]:
X = final_df.iloc[:, 1:]
Y = final_df["target"]

In [40]:
from sklearn import metrics

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, train_size=0.7, test_size=0.3)

# Using Decision Tree Classifier and calling fit function
decisionTree = tree.DecisionTreeClassifier()
decisionTree.fit(X_train, Y_train)

Y_predict = decisionTree.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(Y_test, Y_predict)
print("Accuracy Score: " + str(accuracy))
print(metrics.classification_report(Y_test, Y_predict))



Accuracy Score: 0.26666666666666666
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       0.38      0.60      0.46         5
          2       0.25      0.50      0.33         2
          3       0.00      0.00      0.00         3
          4       0.00      0.00      0.00         1
          6       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         1
         14       0.00      0.00      0.00         0
         40       0.00      0.00      0.00         0

avg / total       0.16      0.27      0.20        15



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [41]:
log_reg = linear_model.LogisticRegression()

log_reg.fit(X_train,Y_train)

Y_predict = log_reg.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, Y_predict)
print("Accuracy Score: " + str(accuracy))
print(metrics.classification_report(Y_test, Y_predict))

Accuracy Score: 0.3333333333333333
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       0.33      1.00      0.50         5
          2       0.00      0.00      0.00         2
          3       0.00      0.00      0.00         3
          4       0.00      0.00      0.00         1
          6       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         1

avg / total       0.11      0.33      0.17        15



  'precision', 'predicted', average, warn_for)
