In [None]:
import pandas, json, numpy, requests, os, datetime, pytz, tweepy, sqlite3, time, re, random, matplotlib.pyplot as plt, sklearn, statsmodels.api as sm
from bs4 import BeautifulSoup
from scipy.signal import find_peaks

This script is for giving commonly found words a value.

The overall plan is to replace every word in a sentence with a value; the values are then added to give a net value for the sentence. All other, uncommon words will be ignored. Only headlines or snippets of New York Times articles that contain the company name will be considered, because many of the articles are irrelevant to the company. All Google search results will be considered.

NOTE: THIS SCRIPT REQUIRES MANUALLY ASSIGNING VALUES TO EACH WORD USING MICROSOFT EXCEL!!!!

In [None]:
### Copying the method to obtain symbols and company names: (Copied from Stats_Linear_Regression_Fit_Moving_Average_Pattern.py)
### NYSE.txt and AMEX.txt are read as tab-separated files with a header row that contains a 'Symbol' column.
NYSE_csv = pandas.read_csv('NYSE.txt', sep="\t", header=0).set_index('Symbol')

AMEX_csv = pandas.read_csv('AMEX.txt', sep="\t", header=0).set_index('Symbol')

### The outer merge on the shared columns keeps every listing of both exchanges.
stock_exchange_ticks_and_names = pandas.merge(NYSE_csv.reset_index(), AMEX_csv.reset_index(), how='outer')
stock_exchange_ticks_and_names.to_csv('merged_NYSE_AMEX.csv')
stock_exchange_ticks_and_names_copy = stock_exchange_ticks_and_names.copy().dropna()

### Raw string: '\|' and '\]' are not valid Python string escapes, so without the r-prefix the literal
### triggers an invalid-escape warning. The compiled pattern itself is unchanged.
regex1 = re.compile(r'[@_!#$%^&*()<>?/\|}{~:[\].]')
regex2 = re.compile('Cl ')

### A listing is rejected when its description contains a special character or 'Cl ', or when its symbol
### contains a special character. The mask is built once instead of growing a DataFrame row by row.
descriptions = stock_exchange_ticks_and_names_copy['Description']
symbols = stock_exchange_ticks_and_names_copy['Symbol']
rejected = (
    descriptions.map(lambda text: bool(regex1.search(text)))
    | descriptions.map(lambda text: bool(regex2.search(text)))
    | symbols.map(lambda text: bool(regex1.search(text)))
).astype(bool)

### The surviving rows keep their original index labels and only the two columns used later on.
stock_exchange_ticks_and_names_removed = stock_exchange_ticks_and_names_copy.loc[~rejected, ['Symbol', 'Description']].copy()

stock_indices_df = pandas.DataFrame({'Description': ['S&P', 'Dow', "Nasdaq"], 'Symbol': ['.INX', '.DJI', ".IXIC"]})

stocks_and_names_with_indices = pandas.concat([stock_exchange_ticks_and_names_removed, stock_indices_df])
### set_index followed by reset_index renumbers the rows 0..n-1 and puts 'Symbol' in the first column.
stocks_and_names_with_indices = stocks_and_names_with_indices.set_index('Symbol')
stocks_and_names_with_indices = stocks_and_names_with_indices.reset_index()
stocks_and_names_with_indices.to_csv('merged_NYSE_AMEX_word_extraction.csv')

In [None]:
def word_extraction(name, symbol=None):
    """Collect the words used in the NY Times and Google texts of one company.

    Parameters
    ----------
    name : str
        Company name. It is also the base name of the two CSV files that are
        read from the 'Historical Articles' and 'Google Search - News' folders
        of the current working directory.
    symbol : str, optional
        Ticker symbol. When given, the text '(SYMBOL)' is removed from every
        text in addition to the company name.

    Returns
    -------
    tuple of (list, list)
        (times_all_words_list, google_all_words_list). Every text contributes
        each of its distinct whitespace-separated words once, so the number of
        times a word occurs in a list is the number of texts that contain it.

    Raises
    ------
    Whatever pandas.read_csv raises when one of the two files cannot be read,
    and KeyError when an expected column ('snippet', 'abstract', 'headline')
    is missing.
    """
    ### Loading in NY Times and Google data:
    times_df = pandas.read_csv(os.path.join(os.getcwd(), 'Historical Articles', name + '.csv'))
    google_df = pandas.read_csv(os.path.join(os.getcwd(), 'Google Search - News', name + '.csv'))

    def unique_words(text):
        ### Removing the company name (and ticker) so they are not counted as common words.
        cleaned = text.replace(name, '')
        if symbol is not None:
            cleaned = cleaned.replace('(' + str(symbol) + ')', '')
        return list(set(cleaned.split()))

    ### Finding the most common words in NY Times:
    times_all_words_list = []
    times_articles_count = 0

    for _, article in times_df.iterrows():
        ### Only the first field that mentions the company is used; articles without a mention are skipped.
        matching_field = next(
            (field for field in ('snippet', 'abstract', 'headline') if name in str(article[field])),
            None,
        )
        if matching_field is None:
            continue

        times_all_words_list.extend(unique_words(str(article[matching_field])))
        times_articles_count += 1

    print('Number of Times Articles Extracted: ', times_articles_count)

    ### Finding the most common words in Google News Web Search Results:
    google_all_words_list = []
    google_articles_count = 0

    for _, result in google_df.iterrows():
        ### A missing headline or snippet is read as NaN; it is left out instead of aborting the whole company.
        parts = [str(part) for part in (result['headline'], result['snippet']) if pandas.notna(part)]

        google_all_words_list.extend(unique_words(' '.join(parts)))
        google_articles_count += 1

    print('Number of Google Search Results Extracted: ', google_articles_count)

    return times_all_words_list, google_all_words_list

In [None]:
def words_counting(times_all_words_list, google_all_words_list):
    """Count how often each word occurs in the two word lists combined.

    Parameters
    ----------
    times_all_words_list : list of str
        Words extracted from the NY Times texts.
    google_all_words_list : list of str
        Words extracted from the Google search results.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word with the columns 'Word' and 'Count', in
        order of first appearance (Google words first). Both columns are
        present even when the two lists are empty.
    """
    from collections import Counter

    # A single pass over the words replaces one list.count() scan per distinct word.
    word_counts = Counter(google_all_words_list)
    word_counts.update(times_all_words_list)

    return pandas.DataFrame(
        {'Word': list(word_counts.keys()), 'Count': list(word_counts.values())},
        columns=['Word', 'Count'],
    )


In [None]:
# Looping through all stocks and collecting the word counts of every stock whose files could be read:
per_stock_counts = []
for u, v in stocks_and_names_with_indices.iterrows():
    try:
        times_all_words_list, google_all_words_list = word_extraction(v['Description'])
    except Exception as error:
        # Best effort: a stock without readable article files is reported and skipped.
        print('Error: ', v['Description'], repr(error))
        continue
    combined_df = words_counting(times_all_words_list, google_all_words_list)
    if not combined_df.empty:
        per_stock_counts.append(combined_df[['Word', 'Count']])

# Summing the counts per word over all stocks. Building the total after the loop means it no longer
# depends on the very first stock succeeding, and words first seen in a later stock are included.
if per_stock_counts:
    total_df = (
        pandas.concat(per_stock_counts, ignore_index=True)
        .groupby('Word', as_index=False, sort=False)['Count']
        .sum()
    )
else:
    total_df = pandas.DataFrame(columns=['Word', 'Count'])

                    

In [None]:
# Exporting the total word counts in all Times and Google results.
# The DataFrame index is written as well (to_csv default), so the file starts with an unnamed index column.
total_df.to_csv('All Word Extraction Counts.csv')

In [None]:
### Graphing the counts:
### The figure is titled and labelled so that it can be read on its own.
fig, ax = plt.subplots()
ax.hist(total_df.Count, bins=70)
ax.set_title('Distribution of word counts (NY Times + Google)')
ax.set_xlabel('Count of a word')
ax.set_ylabel('Number of words')
plt.show()

____Manually Assigning Values______

For this part, I will be assigning two values to each word: a Sentence_Value and an Importance_Value.

Sentence_Value = the value the word will contribute to the net value of the sentence (sell, buy, drop, increase, etc.)
Sentence values are additions, in the range of -10 to +10

Importance_Value = the value the word means for the stock in general (earnings, bankruptcy, etc.)
Importance values are multipliers, in the range of 0 to +10

Example:
Sentence = the stock will increase in price due to earnings
Conversion = 0 0 0 +5 0 (+ 0, *2) 0 0 (+0, *10)
Sentence Net Value = 5 * (10 + 2) =  60
Here, the word "increase" has a sentence value of +5, while "price" and "earnings" have sentence values of 0 but instead add to the multiplier of the overall net value.

In [None]:
### Loading the manually valued words and dropping every word whose Sentence_Value and Importance_Value are both 0.
total_df_valuated = pandas.read_csv('All Word Extraction Counts - Valued.csv')

### True for the rows that carry no value at all.
has_no_value = total_df_valuated.Sentence_Value.eq(0) & total_df_valuated.Importance_Value.eq(0)
no_value_index_list = total_df_valuated.index[has_no_value].tolist()

### reset_index() keeps the previous row labels in a new 'index' column.
total_df_valuated_removed = (
    total_df_valuated
    .loc[~has_no_value]
    .sort_values('Word')
    .reset_index()
)

In [None]:
### Saving the valued words; text_valuate_csv_modify reads this file back in.
total_df_valuated_removed.to_csv('Only Valued Words.csv')

In [None]:
### Creating a DataFrame with all the text that contain the company name, then using the manual valuation as the key
### to assigning values. This is for checking to see how the manual valuation performs. The first part of the definition is
### copied from the word_extraction definition.
def text_valuate(symbol, name):
    """Score every relevant NY Times article and every Google result of one company.

    The word values are taken from the module-level DataFrame
    total_df_valuated_removed (columns 'Word', 'Sentence_Value',
    'Importance_Value'). For each text the sentence values and the importance
    values of its words are summed; the net value is their product.

    Parameters
    ----------
    symbol : str
        Ticker symbol; only printed as a progress message.
    name : str
        Company name; also the base name of the two CSV files read from the
        'Historical Articles' and 'Google Search - News' folders of the
        current working directory.

    Returns
    -------
    tuple of four lists of equal length
        (text_list, sentence_value_list, importance_value_list,
        text_valuation_list), Times articles first, then Google results.
        Times texts are scored on every word occurrence, Google texts on
        their distinct words only (headline and snippet may repeat each other).

    Raises
    ------
    Whatever pandas.read_csv raises when one of the two files cannot be read,
    and KeyError when an expected column is missing.
    """
    ### Loading in NY Times and Google data:
    times_df = pandas.read_csv(os.path.join(os.getcwd(), 'Historical Articles', name + '.csv'))
    google_df = pandas.read_csv(os.path.join(os.getcwd(), 'Google Search - News', name + '.csv'))

    print(symbol)

    ### Building the word -> value tables once instead of scanning the valued words for every single word.
    ### A word listed on several rows contributes the sum of those rows, as a row-by-row scan would.
    sentence_lookup = {}
    importance_lookup = {}
    for word, word_sentence_value, word_importance_value in zip(
            total_df_valuated_removed.Word,
            total_df_valuated_removed.Sentence_Value,
            total_df_valuated_removed.Importance_Value):
        sentence_lookup[word] = sentence_lookup.get(word, 0) + word_sentence_value
        importance_lookup[word] = importance_lookup.get(word, 0) + word_importance_value

    def score(words):
        ### Summing the two values over the given words; words without a valuation count as 0.
        sentence_value = 0
        importance_value = 0
        for word in words:
            if word in sentence_lookup:
                sentence_value = sentence_value + sentence_lookup[word]
                importance_value = importance_value + importance_lookup[word]
        return sentence_value, importance_value

    text_list = []
    text_valuation_list = []
    sentence_value_list = []
    importance_value_list = []

    def record(text, words):
        ### Appending one scored text to all four result lists so they stay aligned.
        sentence_value, importance_value = score(words)
        text_list.append(text)
        text_valuation_list.append(sentence_value * importance_value)
        sentence_value_list.append(sentence_value)
        importance_value_list.append(importance_value)

    ### Valuing the NY Times articles:
    times_articles_count = 0

    for _, article in times_df.iterrows():
        ### Only the first field that mentions the company is used; articles without a mention are skipped.
        matching_field = next(
            (field for field in ('snippet', 'abstract', 'headline') if name in str(article[field])),
            None,
        )
        if matching_field is None:
            continue

        string_extraction = str(article[matching_field])
        record(string_extraction, string_extraction.split())
        times_articles_count += 1

    print('Number of Times Articles Extracted: ', times_articles_count)
    print('Length of Times Articles Extracted: ', len(text_list), "; Times Valuation List: ", len(text_valuation_list))

    ### Valuing the Google search results:
    google_articles_count = 0
    for _, result in google_df.iterrows():
        ### A missing headline or snippet is read as NaN; it is left out instead of aborting the whole company.
        parts = [str(part) for part in (result['headline'], result['snippet']) if pandas.notna(part)]
        string_extraction = ' '.join(parts)

        ### Only distinct words are scored because the headline and snippet might be the same.
        record(string_extraction, set(string_extraction.split()))
        google_articles_count += 1

    print('Number of Google Search Results Extracted: ', google_articles_count)
    print('Length of Times & Google Articles Extracted: ', len(text_list), "; Times & Google Valuation List: ", len(text_valuation_list))

    return text_list, sentence_value_list, importance_value_list, text_valuation_list

In [None]:
# Looping through all stocks and collecting one DataFrame of valued texts per stock:
valued_text_frames = []
for u, v in stocks_and_names_with_indices.iterrows():
    try:
        text_list, sentence_value_list, importance_value_list, text_valuation_list = text_valuate(v['Symbol'], v['Description'])
    except Exception as error:
        # Best effort: a stock without readable article files is reported and skipped.
        print('Error: ', v['Description'], repr(error))
        continue

    text_dict = {'Text': text_list, 'Total_Sentence_Value': sentence_value_list, 'Total_Importance_Value': importance_value_list, 'Net_Sentence_Value': text_valuation_list}
    temp_df = pandas.DataFrame(text_dict)
    valued_text_frames.append(temp_df)

# Concatenating once after the loop: text_df no longer depends on the very first stock succeeding.
# The row labels restart at 0 for every stock, as they do when the frames are concatenated one by one.
if valued_text_frames:
    text_df = pandas.concat(valued_text_frames)
else:
    text_df = pandas.DataFrame(columns=['Text', 'Total_Sentence_Value', 'Total_Importance_Value', 'Net_Sentence_Value'])

In [None]:
### Exporting every valued text so that the valuations can be checked by hand.
text_df.to_csv('Valued Articles.csv')

### Manually checking to see if the word valuation gave importance to relevant text:
### Most of the texts are valued well enough, as intended.

In [None]:
### Modifying text_valuate to add the Sentence_Value, Importance_Value, and Text_value (Net_Sentence_Value) to the .csv
### file that contains the text information:
def text_valuate_csv_modify(symbol, name):
    """Add the word valuation of every text to the two CSV files of one company.

    The columns 'Sentence_Value', 'Importance_Value' and 'Text_Value'
    (sentence value * importance value) are written into the company's files
    in the 'Historical Articles' and 'Google Search - News' folders of the
    current working directory. The word values are read from
    'Only Valued Words.csv' in the current working directory.

    Rows that already have a Sentence_Value above 0 are left untouched to
    speed up repeated runs; remove that shortcut if the word values change.
    Times articles that do not mention the company get 0 in all three columns.

    Parameters
    ----------
    symbol : str
        Ticker symbol; only printed as a progress message.
    name : str
        Company name; also the base name of the two CSV files.

    Returns
    -------
    None. The two CSV files are overwritten in place, without the DataFrame
    index, so that repeated runs do not add a new unnamed column each time.

    Raises
    ------
    Whatever pandas.read_csv / to_csv raise when a file cannot be read or
    written, and KeyError when an expected column is missing.
    """
    ### Loading in NY Times and Google data:
    times_path = os.path.join(os.getcwd(), 'Historical Articles', name + '.csv')
    google_path = os.path.join(os.getcwd(), 'Google Search - News', name + '.csv')
    times_df = pandas.read_csv(times_path)
    google_df = pandas.read_csv(google_path)
    total_df_valuated_removed = pandas.read_csv('Only Valued Words.csv')

    print(symbol)

    ### Building the word -> value tables once instead of scanning the valued words for every single word.
    ### A word listed on several rows contributes the sum of those rows, as a row-by-row scan would.
    sentence_lookup = {}
    importance_lookup = {}
    for word, word_sentence_value, word_importance_value in zip(
            total_df_valuated_removed.Word,
            total_df_valuated_removed.Sentence_Value,
            total_df_valuated_removed.Importance_Value):
        sentence_lookup[word] = sentence_lookup.get(word, 0) + word_sentence_value
        importance_lookup[word] = importance_lookup.get(word, 0) + word_importance_value

    def score(words):
        ### Summing the two values over the given words; words without a valuation count as 0.
        sentence_value = 0
        importance_value = 0
        for word in words:
            if word in sentence_lookup:
                sentence_value = sentence_value + sentence_lookup[word]
                importance_value = importance_value + importance_lookup[word]
        return sentence_value, importance_value

    def already_valued(frame, row_label):
        ### This is to speed up the script if the text has been evaluated before.
        if 'Text_Value' not in frame.columns or 'Sentence_Value' not in frame.columns:
            return False
        try:
            return bool(frame.loc[row_label, 'Sentence_Value'] > 0)
        except TypeError:
            ### A value that cannot be compared with 0 is treated as not evaluated yet.
            return False

    def store(frame, row_label, sentence_value, importance_value):
        ### Writing the three valuation columns of one row.
        frame.loc[row_label, 'Sentence_Value'] = sentence_value
        frame.loc[row_label, 'Importance_Value'] = importance_value
        frame.loc[row_label, 'Text_Value'] = sentence_value * importance_value

    ### Valuing the NY Times articles:
    times_articles_count = 0

    for x, y in times_df.iterrows():
        if already_valued(times_df, x):
            continue

        ### Only the first field that mentions the company is used.
        matching_field = next(
            (field for field in ('snippet', 'abstract', 'headline') if name in str(y[field])),
            None,
        )
        if matching_field is None:
            store(times_df, x, 0, 0)
            continue

        sentence_value, importance_value = score(str(y[matching_field]).split())
        store(times_df, x, sentence_value, importance_value)

        times_articles_count += 1

    print('Number of Times Articles Extracted: ', times_articles_count)

    ### Valuing the Google search results:
    google_articles_count = 0
    for x, y in google_df.iterrows():
        if already_valued(google_df, x):
            continue

        ### A missing headline or snippet is read as NaN; it is left out instead of aborting the whole company.
        parts = [str(part) for part in (y['headline'], y['snippet']) if pandas.notna(part)]
        string_extraction = ' '.join(parts)

        ### Only distinct words are scored because the headline and snippet might be the same.
        sentence_value, importance_value = score(set(string_extraction.split()))
        store(google_df, x, sentence_value, importance_value)

        google_articles_count += 1

    print('Number of Google Search Results Extracted: ', google_articles_count)

    ### index=False: the files are read back on the next run, and a written index would come back as one more
    ### 'Unnamed: 0' column every time.
    times_df.to_csv(times_path, index=False)
    google_df.to_csv(google_path, index=False)

In [None]:
# Looping through the stocks from row RESUME_FROM_ROW onwards.
# NOTE(review): the start row 2000 presumably resumes an earlier, interrupted run - set it to 0 to process every stock.
RESUME_FROM_ROW = 2000
for u, v in stocks_and_names_with_indices.iloc[RESUME_FROM_ROW:].iterrows():
    try:
        text_valuate_csv_modify(v['Symbol'], v['Description'])

    except Exception as error:
        # Best effort: a failing stock is reported together with the reason, then the loop moves on.
        print('Error: ', v['Description'], repr(error))