In [4]:
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, metrics
import numpy as np

In [5]:
data = pd.read_csv('data.csv')
data['source'].value_counts()

Reuters                603
Fox News               480
Breitbart News         455
CNN                    393
ABC News               358
BBC News               309
The Hill               266
Independent            255
CBS News               232
The Verge              228
The New York Times     141
NBC News               101
Al Jazeera English      99
Metro                   40
Daily Mail              40
Mirror                  37
The Huffington Post      8
Name: source, dtype: int64

In [3]:
data = data.drop(columns=['Unnamed: 0'],axis=1)
data = data.dropna()
data['target'] = np.where(data['source']=='Reuters', 1, 0)
data = data.drop(columns='source', axis=1) # removing leakage
data = data.drop(columns='author', axis=1) # removing leakage
data = data.drop(columns='url', axis=1) # removing leakage

In [4]:
data

Unnamed: 0,title,description,requested_date,publishedAt,content,target
0,Could this be Samsung’s bezel-less 8K TV?,"Samsung may have a new, groundbreaking type of...",2019-12-31,2019-12-31T22:48:35Z,Filed under:\r\nThe company is rumored to show...,0
1,TikTok claims zero takedown requests from Chin...,"TikTok, owned by Chinese tech giant ByteDance,...",2019-12-31,2019-12-31T22:39:55Z,The highest number of requests came from India...,0
2,Google will finally stop using controversial I...,The company saved tens of billions of dollars ...,2019-12-31,2019-12-31T20:11:26Z,Regulations will end the Double Irish and Dutc...,0
3,New Apple patent imagines virtual speakers tha...,Apple has just been granted a patent — concern...,2019-12-31,2019-12-31T19:02:11Z,Apple has filed patents for the tech in both h...,0
4,Smoke app brings parts of Valve’s Steam to the...,A new third-party app called Smoke will bring ...,2019-12-31,2019-12-31T19:00:00Z,Check in on what friends are playing\r\nFor pe...,0
...,...,...,...,...,...,...
4041,Exclusive: Jim Banks Slams Ilhan Omar for Gigg...,Rep. Jim Banks (R-IN) slammed Rep. Ilhan Omar ...,2020-01-09,2020-01-09T20:11:18Z,Rep. Jim Banks (R-IN) slammed far-left “Squad”...,0
4042,"Farage: Harry 'Let the Side Down', Compares Me...",Brexit Party leader Nigel Farage believes Prin...,2020-01-09,2020-01-09T19:51:56Z,Brexit Party leader Nigel Farage believes Prin...,0
4043,IT BEGINS: Peter Schweizer Book Set to ‘Upend ...,"Peter Schweizer, author of bestsellers Clinton...",2020-01-09,2020-01-09T19:11:10Z,The investigative author behind the game-chang...,0
4044,Book to Reveal How Biden Family Siphoned ‘Mill...,A bombshell book will reveal how five members ...,2020-01-09,2020-01-09T19:10:31Z,A forthcoming bombshell book from publishing g...,0


In [5]:
data = data.sample(frac=1)

In [6]:
data

Unnamed: 0,title,description,requested_date,publishedAt,content,target
2019,Five lingering questions as impeachment heads ...,Speaker Nancy Pelosi (D-Calif.) on Friday ende...,2020-01-13,2020-01-12T11:00:11Z,Speaker Nancy PelosiNancy PelosiSunday shows p...,0
54,US prison cadets fired over Nazi salute photo,All the West Virginia cadets pictured making t...,2019-12-31,2019-12-31T15:17:13Z,Image copyrightWest Virginia Department of Mil...,0
2722,Meghan's father 'would testify in privacy case',The Duchess of Sussex's half sister tells the ...,2020-01-15,2020-01-15T13:02:55Z,Image copyrightDaily Mail/Solo SyndicationImag...,0
1183,France discusses Middle East tensions with Ger...,French Foreign Minister Jean-Yves Le Drian sai...,2020-01-04,2020-01-04T13:05:16Z,PARIS (Reuters) - French Foreign Minister Jean...,1
1992,Geopolitical 'Jeopardy!': Game show faces crit...,Game show 'Jeopardy!' faced criticism online a...,2020-01-13,2020-01-12T14:30:00Z,There seemed to be just $200 on the line when ...,0
...,...,...,...,...,...,...
2941,Donald Trump Signs Phase One Trade Deal with C...,President Donald Trump hosted a signing of the...,2020-01-15,2020-01-15T18:16:12Z,President Donald Trump hosted a signing of the...,0
2652,"Shaheen Bagh protesters vow to fight, seek rol...","One month on, protesters say they will continu...",2020-01-15,2020-01-15T05:16:00Z,"New Delhi, India - Bilquis has been on a sit-i...",0
322,As Virginia governor vows new gun control push...,Backed by Virginia's first Democratic-controll...,2020-01-10,2020-01-10T20:34:28Z,(Reuters) - Backed by Virginia’s first Democra...,1
624,Iraq: Anti-government protesters denounce pro-...,Anti-gov't protesters in Baghdad accuse author...,2020-01-02,2020-01-02T20:58:00Z,Anti-government rallies continued in Iraq's ca...,0


In [7]:
valid_fraction=0.2
valid_rows = int(len(data) * valid_fraction)
train = data[:-valid_rows]
test = data[-valid_rows:]

In [8]:
count_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = count_vectorizer.fit_transform(train["title"][0:5])
print(example_train_vectors[0].todense().shape)
print('')
print(example_train_vectors[0].todense())

(1, 44)

[[0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0
  0 0 1 1 0 0 0 0]]


In [9]:
test.drop(['target'], axis=1)

Unnamed: 0,title,description,requested_date,publishedAt,content
92,Militia Leader Who Led Raid on U.S. Embassy a ...,"Iranian militia leader Hadi al-Amiri, one of s...",2019-12-31,2019-12-31T19:49:44Z,"Iranian militia leader Hadi al-Amiri, one of s..."
1742,Biden uses Trump's own words to make his case ...,Former Vice President Joe Biden is using Presi...,2020-01-14,2020-01-14T10:31:34Z,(CNN)Former Vice President Joe Biden is using ...
3397,Execs accuse McDonald's of racial discrimination,"Two women are taking legal action, claiming a ...",2020-01-08,2020-01-08T17:12:52Z,Image copyrightGetty ImagesImage caption\r\n M...
1089,"Conservative, liberal Methodists to split over...",The United Methodist Church plans to split int...,2020-01-04,2020-01-03T22:42:08Z,(Reuters) - The United Methodist Church plans ...
536,weather forecast: US storms bring power outage...,A triple threat storm continues to move across...,2020-01-11,2020-01-11T16:22:35Z,A firefighter and a police officer were killed...
...,...,...,...,...,...
2941,Donald Trump Signs Phase One Trade Deal with C...,President Donald Trump hosted a signing of the...,2020-01-15,2020-01-15T18:16:12Z,President Donald Trump hosted a signing of the...
2652,"Shaheen Bagh protesters vow to fight, seek rol...","One month on, protesters say they will continu...",2020-01-15,2020-01-15T05:16:00Z,"New Delhi, India - Bilquis has been on a sit-i..."
322,As Virginia governor vows new gun control push...,Backed by Virginia's first Democratic-controll...,2020-01-10,2020-01-10T20:34:28Z,(Reuters) - Backed by Virginia’s first Democra...
624,Iraq: Anti-government protesters denounce pro-...,Anti-gov't protesters in Baghdad accuse author...,2020-01-02,2020-01-02T20:58:00Z,Anti-government rallies continued in Iraq's ca...


In [10]:
train_vectors = count_vectorizer.fit_transform(train["title"])
test_vectors = count_vectorizer.transform(test["title"])

In [11]:
clf = linear_model.RidgeClassifier()

In [12]:
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
scores

array([0.49253731, 0.48951049, 0.48351648])

In [13]:
clf.fit(train_vectors, train["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [14]:
test["prediction"] = clf.predict(test_vectors)
test['correct'] = np.where(test['target']==test['prediction'], 'correct', 'incorrect')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
test['correct'].value_counts()

correct      556
incorrect     90
Name: correct, dtype: int64

In [16]:
target_only = test[test.target == 1]
target_only['correct'].value_counts()

correct      71
incorrect    56
Name: correct, dtype: int64

In [None]:
test.to_csv('predictions.csv')

In [6]:
data = data.sample(frac=0.01)

In [7]:
data.to_csv('sample_train.csv')

In [11]:
while True:
    print('Do you want to save the prediction?')
    decision = input('Enter Y or N').upper()
    if decision == 'Y':
        print('Saving...')
        test.to_csv('predictions.csv')
        break
    elif decision =='N':
        print('Exiting')
        break

Do you want to save the prediction?
Enter Y or Ny
Saving...
