# Combines [Financial News Sentiment](https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news), [Sentiment140](https://www.kaggle.com/kazanova/sentiment140) and NYT sentiment data.

# Exports as .csv

In [None]:
# Mount Google Drive at /content/drive/ so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import os

# Later cells open the CSV files by bare filename, so make the data folder
# the working directory.
news_data_dir = '/content/drive/MyDrive/Colab Notebooks/news_data'
os.chdir(news_data_dir)

Financial News Sentiment

In [None]:
import pandas as pd

# header=None: the first line of the file is treated as data, and the columns
# are numbered 0, 1, ...
finance_data = pd.read_csv(
    'all-data.csv',
    header=None,
    encoding='latin-1',
)

In [None]:
# Preview the raw frame; because it was read with header=None the columns are still the integers 0 and 1.
finance_data.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
# Give the two positional columns descriptive names.
finance_data = finance_data.rename(columns={0: 'Sentiment', 1: 'Body'})

In [None]:
# Put the text column first and the label second.
finance_columns = ['Body', 'Sentiment']
finance_data = finance_data.loc[:, finance_columns]

In [None]:
# Upper-case the labels (e.g. 'neutral' -> 'NEUTRAL').
# The vectorised .str accessor replaces a per-element Python lambda and passes
# missing values through as NaN instead of raising AttributeError on them.
finance_data['Sentiment'] = finance_data['Sentiment'].str.upper()

In [None]:
# Confirm the columns are now Body / Sentiment and the labels are upper-case.
finance_data.head()

Unnamed: 0,Body,Sentiment
0,"According to Gran , the company has no plans t...",NEUTRAL
1,Technopolis plans to develop in stages an area...,NEUTRAL
2,The international electronic industry company ...,NEGATIVE
3,With the new production plant the company woul...,POSITIVE
4,According to the company 's updated strategy f...,POSITIVE


Tweet Sentiment140

In [None]:
# header=None: the first line of the file is treated as data, and the columns
# are numbered from 0.
tweet_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin-1',
)

In [None]:
# Preview the raw tweets; the columns are still the integer positions assigned by header=None.
tweet_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Integer polarity codes in column 0 -> string labels.
tweet_mapping = {0: 'NEGATIVE', 2: 'NEUTRAL', 4: 'POSITIVE'}

# Name the label (0) and text (5) columns, translate the codes, and keep only
# Body and Sentiment. Codes that are not keys of tweet_mapping become NaN.
# NOTE: re-running this cell on the already transformed frame would turn every
# label into NaN, because the string labels are not keys of tweet_mapping.
tweet_data = (
    tweet_data
    .rename(columns={0: 'Sentiment', 5: 'Body'})
    .assign(Sentiment=lambda frame: frame['Sentiment'].map(tweet_mapping))
    .loc[:, ['Body', 'Sentiment']]
)

In [None]:
# Confirm only Body and the mapped Sentiment label remain.
tweet_data.head()

Unnamed: 0,Body,Sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",NEGATIVE
1,is upset that he can't update his Facebook by ...,NEGATIVE
2,@Kenichan I dived many times for the ball. Man...,NEGATIVE
3,my whole body feels itchy and like its on fire,NEGATIVE
4,"@nationwideclass no, it's not behaving at all....",NEGATIVE


NYT News with Sentiment

In [None]:
# This file is read with its own header row (no header=None here).
news_data = pd.read_csv(
    'big-exported-data.csv',
    encoding='latin-1',
)

In [None]:
def clean_by_score(data, threshold=0.8):
  """Return the rows of ``data`` whose assigned sentiment is confidently scored.

  A row is removed when its ``Sentiment`` is one of MIXED / NEGATIVE /
  NEUTRAL / POSITIVE and the matching ``Score*`` column is below
  ``threshold``. Rows with any other ``Sentiment`` value, or whose score is
  missing (NaN), are kept.

  Args:
    data: DataFrame with a ``Sentiment`` column and the ``ScoreMixed``,
      ``ScoreNegative``, ``ScoreNeutral`` and ``ScorePositive`` columns.
    threshold: Minimum score a row's own label must reach to be kept.

  Returns:
    A new DataFrame with the surviving rows, keeping their original index
    labels (call ``.reset_index(drop=True)`` on the result to renumber).
    ``data`` itself is not modified.

  Raises:
    KeyError: If ``Sentiment`` or one of the score columns is missing.
  """
  score_columns = {
    "MIXED": "ScoreMixed",
    "NEGATIVE": "ScoreNegative",
    "NEUTRAL": "ScoreNeutral",
    "POSITIVE": "ScorePositive",
  }
  # Build one boolean mask rather than dropping by index label: a label-based
  # drop also removes unrelated rows when the index contains duplicate labels.
  low_confidence = pd.Series(False, index=data.index)
  for sentiment, score in score_columns.items():
    low_confidence |= (data["Sentiment"] == sentiment) & (data[score] < threshold)
  # Select positionally, and copy so callers can assign columns on the result
  # without a SettingWithCopyWarning.
  return data[~low_confidence.to_numpy()].copy()

In [None]:
# Drop rows whose own label scored below 0.8 (the function's default threshold).
news_data = clean_by_score(news_data, threshold=0.8)

In [None]:
# Count rows per distinct Sentiment value after score filtering.
news_data['Sentiment'].value_counts()

NEUTRAL                                                                                                                               2710
POSITIVE                                                                                                                                88
NEGATIVE                                                                                                                                53
MIXED                                                                                                                                   28
A                                                                                                                                       18
                                                                                                                                      ... 
 and pushed the diplomatic process back to somewhere near square one. How far the Mladic strategy is shared by other leading Serbs       1
 while challengers have onl

In [None]:
# Keep only NEGATIVE / NEUTRAL / POSITIVE articles. Any other Sentiment value
# ('MIXED' or a mis-parsed field) is not a key of news_mapping, so it maps to
# NaN and is removed by dropna().
# NOTE: dropna() has no subset, so a row with a missing value in any column is
# dropped as well.
news_mapping = {'NEGATIVE': 'NEGATIVE', 'NEUTRAL': 'NEUTRAL', 'POSITIVE': 'POSITIVE'}
news_data = (
    news_data
    .assign(Sentiment=lambda frame: frame["Sentiment"].map(news_mapping))
    .dropna()
)

In [None]:
# Re-check the label counts; only NEGATIVE, NEUTRAL and POSITIVE should remain.
news_data['Sentiment'].value_counts()

NEUTRAL     2703
POSITIVE      88
NEGATIVE      52
Name: Sentiment, dtype: int64

In [None]:
# Reduce to the two columns shared with the other datasets, text first.
news_columns = ['Body', 'Sentiment']
news_data = news_data.loc[:, news_columns]

In [None]:
# Preview the reduced NYT frame.
news_data.head()

Unnamed: 0,Body,Sentiment
0,LEAD: IN the agency world there are the conglo...,NEUTRAL
1,"LEAD: System Energy Resources Inc., a unit of ...",NEUTRAL
2,LEAD: *3*** COMPANY REPORTS ** *3*FHP INTERNAT...,NEUTRAL
3,LEAD: *3*** COMPANY REPORTS ** *3*GROWTH VENTU...,NEUTRAL
4,"LEAD: Claire F. O'Brien, a lawyer, and Timothy...",NEUTRAL


Combine all three datasets and export

In [None]:
# Stack the three sources row-wise in this order; ignore_index=True discards
# each frame's own row labels and renumbers the result from 0.
source_frames = [news_data, finance_data, tweet_data]
combined_data = pd.concat(source_frames, axis=0, ignore_index=True)

In [None]:
# Preview the combined frame; the NYT rows come first because news_data is listed first in the concat.
combined_data.head()

Unnamed: 0,Body,Sentiment
0,LEAD: IN the agency world there are the conglo...,NEUTRAL
1,"LEAD: System Energy Resources Inc., a unit of ...",NEUTRAL
2,LEAD: *3*** COMPANY REPORTS ** *3*FHP INTERNAT...,NEUTRAL
3,LEAD: *3*** COMPANY REPORTS ** *3*GROWTH VENTU...,NEUTRAL
4,"LEAD: Claire F. O'Brien, a lawyer, and Timothy...",NEUTRAL


In [None]:
# (rows, columns) of the combined dataset.
combined_data.shape

(1607689, 2)

In [None]:
# Write the combined dataset with a header row and without the row index column.
combined_data.to_csv(
    'combined_data.csv',
    header=True,
    index=False,
)

In [None]:
# Reload the file that was just written to check that it can be read back.
exported_data = pd.read_csv('combined_data.csv')

In [None]:
# Preview the reloaded export.
exported_data.head()

Unnamed: 0,Body,Sentiment
0,LEAD: IN the agency world there are the conglo...,NEUTRAL
1,"LEAD: System Energy Resources Inc., a unit of ...",NEUTRAL
2,LEAD: *3*** COMPANY REPORTS ** *3*FHP INTERNAT...,NEUTRAL
3,LEAD: *3*** COMPANY REPORTS ** *3*GROWTH VENTU...,NEUTRAL
4,"LEAD: Claire F. O'Brien, a lawyer, and Timothy...",NEUTRAL
