# Cowboys News Topic Modeling

In [1]:
import json
from pprint import pprint

with open('cowboy_news.json', encoding='utf-8') as f:
    df = json.load(f)
df = json.loads(df)

In [2]:
df[0]

{'date': '2018-08-22 00:00:00',
 'keywords': ['frederick',
  'treatments',
  'cowboys',
  'youre',
  'dallas',
  'game',
  'field',
  'reveals',
  'center',
  'disease',
  'timetable',
  'hes',
  'jones',
  'travis',
  'return',
  'spine',
  'rare'],
 'text': 'Dallas Cowboys center Travis Frederick is seen during the afternoon practice at the training camp in Oxnard, Calif., Saturday, Aug. 4, 2018. (Jae S. Lee/The Dallas Morning News)\n\n"In the last 48 hours, I have received two treatments that address my condition, and I am feeling much better from an overall strength perspective," Frederick said in the statement. "I will continue these treatments over the next few days. I am very optimistic about my condition and the immediate future, as I have been told that the illness was detected at a fairly early stage."\n\nThe Cowboys\' All-Pro center released a statement Wednesday revealing that he has been diagnosed with Guillain-Barre Syndrome, a rare auto-immune disease that causes the bod

In [3]:
import re

def preprocess_post(post):
    post = post.lower()
    post = re.sub('\n', ' ', post)
    post = re.sub(r'[^\w\s]','',post)
    
    return post

In [4]:
sent = []
for post in df:
    k = preprocess_post(post['text'])
    sent.append(k)

In [5]:
sent[0]

'dallas cowboys center travis frederick is seen during the afternoon practice at the training camp in oxnard calif saturday aug 4 2018 jae s leethe dallas morning news  in the last 48 hours i have received two treatments that address my condition and i am feeling much better from an overall strength perspective frederick said in the statement i will continue these treatments over the next few days i am very optimistic about my condition and the immediate future as i have been told that the illness was detected at a fairly early stage  the cowboys allpro center released a statement wednesday revealing that he has been diagnosed with guillainbarre syndrome a rare autoimmune disease that causes the bodys immune system to attack its nerves  an update on whats going on with me pictwittercomkskmuvycwk  frederick added that his doctors have said its currently not possible to determine when he will be able to return to the field according to the mayo clinic theres no known cure for guillainbar

In [6]:
len(sent)

31

In [7]:
tx = sent[:25] # train
tex = sent[26:] # test

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(tx)
X = cv.transform(tx)
X_test = cv.transform(tex)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [1 if i < 15 else 0 for i in range(25)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.714285714286
Accuracy for C=0.05: 0.714285714286
Accuracy for C=0.25: 0.714285714286
Accuracy for C=0.5: 0.714285714286
Accuracy for C=1: 0.857142857143




In [10]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)

LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)

('said', 0.06780972645426632)
('2018', 0.047760845099973664)
('take', 0.043781333260431959)
('back', 0.04258163393106839)
('news', 0.042354947157079235)


In [12]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('see', -0.047914852203324282)
('follow', -0.040716677374939211)
('video', -0.040657776328493496)
('well', -0.036820027417992333)
('too', -0.035762820851556409)
