
# Text Modeling - TextRazor

Reference: [TextRazor tutorials](https://www.textrazor.com/tutorials)

In [3]:
import os

import pandas as pd
import textrazor

import utils

In [6]:
# Open the scraped-articles database through the project helper and load a
# 100-row sample of the ARTICLES_538 table into a DataFrame.
# NOTE(review): LIMIT without ORDER BY leaves the row order up to the database;
# add an ORDER BY if ARTICLE_ID must select the same article on every run.
conn = utils.create_connection('../scraping538.db')
query = "SELECT * FROM ARTICLES_538 LIMIT 100"
articles_all = pd.read_sql(query, con=conn)

# Show (rows, columns) as a quick sanity check on the load.
articles_all.shape

(100, 14)

## Article Selection

In [46]:
ARTICLE_ID = 0

# Show the chosen row's headline metadata; each label is pre-padded so the
# printed values line up in one column.
for label, column in (
    ('title:        ', 'title'),
    ('url:          ', 'url'),
    ('filed under:  ', 'filed_under'),
):
    print(label, articles_all.loc[ARTICLE_ID, column])

# Body text of the selected article, kept for the analysis cells below.
to_analyze = articles_all.loc[ARTICLE_ID, 'article_text']

title:         Can The Warriors Still Win Without Curry? And Should They Even Try?
url:           https://fivethirtyeight.com/features/can-the-warriors-still-win-without-curry-and-should-they-even-try/
filed under:   NBA


In [11]:
# Credentials must not live in the notebook: read the TextRazor key from the
# TEXTRAZOR_API_KEY environment variable. A missing variable raises KeyError
# here, before any request is attempted.
textrazor.api_key = os.environ['TEXTRAZOR_API_KEY']

In [112]:
# Extractors requested from TextRazor for every analysis made with this client.
EXTRACTORS = ['entailments', 'entities', 'topics', 'words', 'phrases']
client = textrazor.TextRazor(extractors=EXTRACTORS)

In [113]:
# Ask for category classification with the "textrazor_newscodes" classifier;
# the results are read back through response.categories() further down.
client.set_classifiers(["textrazor_newscodes"])

In [114]:
# Analyse exactly the text held in `to_analyze`: the noun-phrase cell slices
# that same string with the character offsets returned in this response, so
# both must refer to one article.
response = client.analyze(to_analyze)

In [115]:
# Summarise the analysis: the detected language, then the raw record of each
# coarse topic.
print('Response Language: ', response.language)
print('------------------\nCoarse Topics:')
for coarse_topic in response.coarse_topics():
    print(coarse_topic.json)
# Close the listing with a rule only; repeating the "Coarse Topics:" header
# here would announce a second list that is never printed.
print('------------------')

Response Language:  eng
------------------
Coarse Topics:
{'id': 0, 'label': 'Sports', 'wikiLink': 'http://en.wikipedia.org/Category:Sports', 'score': 1}
{'id': 1, 'label': 'Leisure', 'wikiLink': 'http://en.wikipedia.org/Category:Leisure', 'score': 0.705}
{'id': 2, 'label': 'Belief', 'wikiLink': 'http://en.wikipedia.org/Category:Belief', 'score': 0.03553}
{'id': 3, 'label': 'Culture', 'wikiLink': 'http://en.wikipedia.org/Category:Culture', 'score': 0.03412}
{'id': 4, 'label': 'Science', 'wikiLink': 'http://en.wikipedia.org/Category:Science', 'score': 0.02951}
{'id': 5, 'label': 'Violence', 'wikiLink': 'http://en.wikipedia.org/Category:Violence', 'score': 0.02375}
------------------
Coarse Topics:


In [116]:
# Keep the raw response payload for inspection (TODO confirm it really holds
# everything the API returned).
# NOTE(review): the name `json` would shadow the standard-library `json`
# module if that is ever imported in this notebook; consider renaming it
# together with the cells that read it.
json = response.json

In [118]:
# Length of the stringified payload, recorded for three extractor lists.
# Removing 'dependency-trees' left the size unchanged; removing 'relations'
# as well made it smaller.
# ['entailments', 'relations', 'dependency-trees', "entities", "topics", "words","phrases"] >> 2513498
# ['entailments', 'relations', "entities", "topics", "words","phrases"] >> 2513498
# ['entailments', "entities", "topics", "words","phrases"] >> 2430649
len(str(json))  

2430649

In [111]:
# Display the raw payload.
# NOTE(review): its string form is about 2.4 million characters (see the
# length check), so this output is very large; consider displaying only a
# small part of it, e.g. the keys under 'response'.
json

{'ok': True,
 'response': {'categories': [{'categoryId': '15008001',
    'classifierId': 'textrazor_newscodes',
    'id': 0,
    'label': 'sport>basketball>national basketball association (north american professional)',
    'score': 0.6242},
   {'categoryId': '15000000',
    'classifierId': 'textrazor_newscodes',
    'id': 1,
    'label': 'sport',
    'score': 0.6139},
   {'categoryId': '15008000',
    'classifierId': 'textrazor_newscodes',
    'id': 2,
    'label': 'sport>basketball',
    'score': 0.6134},
   {'categoryId': '10001000',
    'classifierId': 'textrazor_newscodes',
    'id': 3,
    'label': 'lifestyle and leisure>game',
    'score': 0.5317},
   {'categoryId': '07009000',
    'classifierId': 'textrazor_newscodes',
    'id': 4,
    'label': 'health>injury',
    'score': 0.531},
   {'categoryId': '15003001',
    'classifierId': 'textrazor_newscodes',
    'id': 5,
    'label': 'sport>american football>(us) national football league (nfl) (north american)',
    'score': 0.3554}

## Entailments

## Relations

## Sentences: Dependency Trees

## Words and Phrases

In [47]:
# Print the surface text of every noun phrase by slicing the analysed article
# from the start offset of the phrase's first word to the end offset of its
# last word. The loop variable is deliberately not called `np`: that name is
# the conventional numpy alias and would stay bound as a notebook global.
for noun_phrase in response.noun_phrases():
    phrase_words = noun_phrase.words
    phrase_start = phrase_words[0].input_start_offset
    phrase_end = phrase_words[-1].input_end_offset
    print(to_analyze[phrase_start:phrase_end])

6
Wins
Playoffs
4 weeks
8 weeks
44.1
Finals
42.2
53
Finals
42
6
5
5 weeks
No time
43.6
46.7
51
6
68%
6 weeks
43.1
7%
48
4%
6
1 week
7 weeks
Time
Curry returns
full strength
46.1
42.6
65
46
7
6
62
2 weeks
7
45.5
3 weeks
44.5
55
Our model
guard Klay Thompson
the playoffs
All-Stars
Draymond Green and D
Angelo Russell
that
the team
probability
the playoffs
50 percent
he
five weeks
time
Golden State
only 43 wins
you
things
any
the Golden State Warriors
who
the wrong end
two blowouts
their first week
play
a third
Wednesday
the floor
the new Chase Center
the most basic level
the injury
a struggling team
that
it
any other team
these past few seasons
Curry
impact
Rotation pieces
Willie Cauley-Stein and Alec Burks
the lineup
the Warriors
center Kevon Looney
neuropathy
some point
all this
mind
how much time
Curry
you
speculation
Russell
about Green
considerable rest
the notion
the Warriors
things
Curry
one
the third quarter
Stephen Curry
his left
transition
a layup
Phoenix Suns center Aron Baynes

## Entities

In [39]:
# Peek at the first entity in the response to see which DBpedia types are
# attached to it.
test = response.entities()[0]
test.dbpedia_types

['Agent', 'Person', 'Athlete', 'BasketballPlayer']

In [30]:
# Rank the entities from most to least relevant, then report each distinct
# entity id once (the first, i.e. highest-ranked, occurrence wins).
entities = sorted(
    response.entities(),
    key=lambda ent: ent.relevance_score,
    reverse=True,
)
seen = set()

for entity in entities:
    if entity.id in seen:
        continue  # this id has already been reported
    print('entity.id: ', entity.id,
          ' || entity.relevance_score: ', entity.relevance_score,
          ' || entity.confidence_score: ', entity.confidence_score,
          ' || entity.freebase_types: ', entity.freebase_types)
    print('--'*20)
    seen.add(entity.id)

entity.id:  Golden State Warriors  || entity.relevance_score:  0.879  || entity.confidence_score:  111.3  || entity.freebase_types:  ['/sports/sports_team', '/basketball/basketball_team', '/award/award_winner', '/tv/tv_actor', '/tv/tv_subject', '/organization/organization', '/award/award_nominee', '/business/employer', '/sports/professional_sports_team']
----------------------------------------
entity.id:  Los Angeles Clippers  || entity.relevance_score:  0.8687  || entity.confidence_score:  79.07  || entity.freebase_types:  ['/sports/professional_sports_team', '/sports/sports_team', '/tv/tv_subject', '/basketball/basketball_team']
----------------------------------------
entity.id:  Stephen Curry  || entity.relevance_score:  0.8388  || entity.confidence_score:  11.4  || entity.freebase_types:  ['/people/measured_person', '/award/award_winner', '/sports/pro_athlete', '/basketball/basketball_player', '/people/person', '/award/award_nominee']
----------------------------------------
enti

In [20]:
# List the label of every topic whose score is above 0.3, in response order.
high_scoring_labels = (
    candidate.label
    for candidate in response.topics()
    if candidate.score > 0.3
)
for topic_label in high_scoring_labels:
    print(topic_label)

Golden State Warriors
San Antonio Spurs
Klay Thompson
Sports teams
Basketball Association of America teams
Basketball
Men's sports
Ball games
Sports organizations
Sports team relocations
Athletic sports
Team sports
National Basketball Association teams
Relocated National Basketball Association teams
Professional sports leagues
Basketball teams
National Basketball Association
Sports
Los Angeles Clippers
National Basketball Association seasons
Stephen Curry
New York Knicks
Steve Kerr
Basketball Association of America
David Robinson
Blake Griffin
Utah Jazz
Detroit Pistons
Basketball leagues
Atlanta Hawks
Tim Duncan
Los Angeles Lakers
National Basketball Association games
Charlotte Hornets
Sports organizations of the United States
Point guard
Phoenix Suns
Sports competitions
NBA draft lottery
Draymond Green
National Basketball Association playoffs
Trae Young
Sports events
Kevon Looney
National Basketball Association players
Playoffs
Cleveland Cavaliers
Men's basketball
National Basketball 

In [120]:
# One line per category returned by the classifier: id, label, score.
for category in response.categories():
    category_fields = (category.category_id, category.label, category.score)
    print(*category_fields)

15008001 sport>basketball>national basketball association (north american professional) 0.6229
15000000 sport 0.6154
15008000 sport>basketball 0.612
10001000 lifestyle and leisure>game 0.5311
07009000 health>injury 0.5222
15003001 sport>american football>(us) national football league (nfl) (north american) 0.3579
15003000 sport>american football 0.3339
15073040 sport>sports event>national tournament 0.3298
15054000 sport>soccer 0.3295


In [122]:
# Inspect the raw record behind one category. The last category is fetched
# explicitly so this cell does not depend on a loop variable left over from
# another cell.
last_category = list(response.categories())[-1]
last_category.json

{'categoryId': '15054000',
 'classifierId': 'textrazor_newscodes',
 'id': 8,
 'label': 'sport>soccer',
 'score': 0.3295}