I want to label each day in final_dataset.csv as positive or negative, depending on whether that day's news moved the market up or down. For each stock I will create a column that compares the price on that day with the price on the previous day. If the price increased, the day is labeled positive (1); if it decreased, it is labeled negative (-1); if it did not change, it is labeled 0.

In [1]:
import pandas as pd

# Relative path of the input CSV.
final_datapath = "data/final_dataset.csv"

# Parse the CSV into a DataFrame; the bare name below renders it as the
# cell's output.
data = pd.read_csv(filepath_or_buffer=final_datapath)

data

Unnamed: 0,Date,Disney,Microsoft,Walmart,Google,Exxon Mobil,Apple,Intel,JP Morgan,Johnson & Johnson,...,NASDAQ Composite,Dow Jones,S&P 500,Fed Funds Rate,Oil,Gold,10Y Treasury,EUR-USD,USD-JPY,news_text
0,2004-08-19,17.806047,16.771729,12.069075,2.496041,22.676308,0.461482,12.699708,21.737772,31.452284,...,1819.890015,10040.820312,1091.229980,1.4420,48.700001,407.100006,4.211,1.237195,109.360001,Democrats' Legal Challenges Impede Nader Campa...
1,2004-08-20,17.837843,16.821201,12.022876,2.694301,22.756765,0.462834,12.474672,22.109600,31.596300,...,1838.020020,10110.139648,1098.349976,1.4540,47.860001,413.200012,4.231,1.232195,109.110001,7 More Managers Fired Over Nortel Accounting W...
2,2004-08-23,17.623217,16.895638,11.835877,2.721417,22.626007,0.467042,12.630469,22.052399,31.601843,...,1838.699951,10073.049805,1095.680054,1.4770,46.049999,410.500000,4.279,1.215200,109.739998,World Briefing | Africa: Uganda: 68 Percent In...
3,2004-08-24,17.734505,16.895638,11.849078,2.608728,22.555599,0.480116,12.503527,22.075277,31.668293,...,1836.890015,10098.629883,1096.189941,1.5170,45.209999,403.000000,4.283,1.208196,109.690002,National Briefing Names of the Dead Metro Brie...
4,2004-08-25,17.885534,17.087910,11.901878,2.636839,22.726597,0.496646,12.665086,22.561522,31.939745,...,1860.719971,10181.740234,1104.959961,1.5120,43.470001,407.899994,4.261,1.208605,110.099998,9/11 Panel Leader Has Praise for Plan to Split...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,2024-12-25,112.555000,437.039734,92.257973,195.393150,104.538502,257.987671,20.420000,239.999573,143.329117,...,20025.745117,43311.416016,6038.814941,4.2075,69.860001,2629.400024,4.585,1.040258,157.106995,Judge Strikes Down Portions of Arkansas Law Th...
5310,2024-12-26,112.550003,436.432068,92.312691,195.138748,104.582695,258.396667,20.440001,240.409912,143.196320,...,20020.359375,43325.800781,6037.589844,4.2150,69.620003,2638.800049,4.579,1.039955,157.132996,Thursday Briefing: Rebel Factions Try to Unite...
5311,2024-12-27,111.550003,428.881104,91.188515,192.305435,104.572884,254.974930,20.299999,238.462036,142.675003,...,19722.029297,42992.210938,5970.839844,4.1780,70.599998,2617.199951,4.619,1.042318,157.748001,Friday Briefing: How Israel Weakened Civilian ...
5312,2024-12-30,110.800003,423.202911,90.104111,190.789047,103.865776,251.593094,19.820000,236.632812,140.992996,...,19486.789062,42573.730469,5906.939941,4.1820,70.989998,2606.100098,4.545,1.042938,157.873001,The Evening: Record Homelessness in the U.S. Y...


In [2]:
# Show the column labels of the loaded DataFrame.
data.columns

Index(['Date', 'Disney', 'Microsoft', 'Walmart', 'Google', 'Exxon Mobil',
       'Apple', 'Intel', 'JP Morgan', 'Johnson & Johnson', 'Coca-Cola',
       'NASDAQ Composite', 'Dow Jones', 'S&P 500', 'Fed Funds Rate', 'Oil',
       'Gold', '10Y Treasury', 'EUR-USD', 'USD-JPY', 'news_text'],
      dtype='object')

In [None]:
# Keep Date and news_text as they are and add one "<name>_movement" column for every other column.
# Each movement value is 1, -1 or 0, depending on whether the value went up, went down or stayed the same relative to the previous row.

def create_stock_movement_column(df, stock_name):
    """Add a ``<stock_name>_movement`` column with the row-over-row direction.

    Every row is compared with the row directly above it (positional order;
    rows are assumed to already be in chronological order):

    * ``1``  -- the value is higher than in the previous row
    * ``-1`` -- the value is lower than in the previous row
    * ``0``  -- unchanged, the first row (it has no previous row), or either
      of the two values is NaN

    Args:
        df: DataFrame that contains the numeric column ``stock_name``.
            It is modified in place.
        stock_name: Name of the column to compare row by row.

    Returns:
        The same ``df`` object, with ``<stock_name>_movement`` added
        (or overwritten if it already existed).

    Raises:
        KeyError: If ``stock_name`` is not a column of ``df``.
    """
    prices = df[stock_name]
    # shift(1) lines every row up with the value of the row above it; the
    # first row gets NaN, which compares False both ways and so yields 0.
    previous = prices.shift(1)

    went_up = (prices > previous).astype("int64")
    went_down = (prices < previous).astype("int64")

    # A single whole-column assignment replaces the chained
    # ``df[col].iloc[i] = ...`` writes, which emit warnings and do not update
    # ``df`` at all when pandas Copy-on-Write is active.
    df[f"{stock_name}_movement"] = went_up - went_down

    return df

# Columns to label: every column except 'Date' and 'news_text'.
# Selecting by name (and skipping any '<name>_movement' column that already
# exists) keeps this cell safe to re-run; a positional slice would pick up
# 'news_text' and the generated columns on a second execution.
data_columns = pd.Index(
    [
        column
        for column in data.columns
        if column not in ("Date", "news_text") and not column.endswith("_movement")
    ]
)

# Add one '<name>_movement' column per selected series.
for stock in data_columns:
    data = create_stock_movement_column(data, stock)

# Display the DataFrame with the new movement columns (nothing is saved here).
data


In [5]:
# Show the column labels again, now including the added "_movement" columns.
data.columns

Index(['Date', 'Disney', 'Microsoft', 'Walmart', 'Google', 'Exxon Mobil',
       'Apple', 'Intel', 'JP Morgan', 'Johnson & Johnson', 'Coca-Cola',
       'NASDAQ Composite', 'Dow Jones', 'S&P 500', 'Fed Funds Rate', 'Oil',
       'Gold', '10Y Treasury', 'EUR-USD', 'USD-JPY', 'news_text',
       'Disney_movement', 'Microsoft_movement', 'Walmart_movement',
       'Google_movement', 'Exxon Mobil_movement', 'Apple_movement',
       'Intel_movement', 'JP Morgan_movement', 'Johnson & Johnson_movement',
       'Coca-Cola_movement', 'NASDAQ Composite_movement', 'Dow Jones_movement',
       'S&P 500_movement', 'Fed Funds Rate_movement', 'Oil_movement',
       'Gold_movement', '10Y Treasury_movement', 'EUR-USD_movement',
       'USD-JPY_movement'],
      dtype='object')

In [6]:
# Discard every raw series listed in data_columns; all other columns are
# left untouched. The bare name renders the result as the cell's output.
data = data.drop(data_columns, axis="columns")
data

Unnamed: 0,Date,news_text,Disney_movement,Microsoft_movement,Walmart_movement,Google_movement,Exxon Mobil_movement,Apple_movement,Intel_movement,JP Morgan_movement,...,Coca-Cola_movement,NASDAQ Composite_movement,Dow Jones_movement,S&P 500_movement,Fed Funds Rate_movement,Oil_movement,Gold_movement,10Y Treasury_movement,EUR-USD_movement,USD-JPY_movement
0,2004-08-19,Democrats' Legal Challenges Impede Nader Campa...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2004-08-20,7 More Managers Fired Over Nortel Accounting W...,1,1,-1,1,1,1,-1,1,...,1,1,1,1,1,-1,1,1,-1,-1
2,2004-08-23,World Briefing | Africa: Uganda: 68 Percent In...,-1,1,-1,1,-1,1,1,-1,...,1,1,-1,-1,1,-1,-1,1,-1,1
3,2004-08-24,National Briefing Names of the Dead Metro Brie...,1,0,1,-1,-1,1,-1,1,...,-1,-1,1,1,1,-1,-1,1,-1,-1
4,2004-08-25,9/11 Panel Leader Has Praise for Plan to Split...,1,1,1,1,1,1,1,1,...,1,1,1,1,-1,-1,1,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,2024-12-25,Judge Strikes Down Portions of Arkansas Law Th...,-1,-1,1,-1,1,1,1,1,...,-1,-1,1,-1,1,-1,1,-1,-1,-1
5310,2024-12-26,Thursday Briefing: Rebel Factions Try to Unite...,-1,-1,1,-1,1,1,1,1,...,-1,-1,1,-1,1,-1,1,-1,-1,1
5311,2024-12-27,Friday Briefing: How Israel Weakened Civilian ...,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,1,-1,1,1,1
5312,2024-12-30,The Evening: Record Homelessness in the U.S. Y...,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,-1,-1,1,1


In [7]:
# Write the labeled DataFrame to disk as CSV, leaving out the row index.
data.to_csv(path_or_buf="data/movement_dataset.csv", index=False)