In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# Load the six tweet exports (one CSV per region and search term) into DataFrames
us_re, us_ce, us_ge, eu_re, eu_ce, eu_ge = map(
    pd.read_csv,
    [
        're_usa.csv',
        'ce_usa.csv',
        'ge_usa.csv',
        're_eu.csv',
        'ce_eu.csv',
        'ge_eu.csv',
    ],
)

# Data Understanding

In [3]:
# Report the number of rows in each source DataFrame, one line per dataset
tweet_counts = [
    ('No. of Renewable Energy tweets from the US:', us_re),
    ('No. of Clean Energy tweets from the US:', us_ce),
    ('No. of Green Energy tweets from the US:', us_ge),
    ('No. of Renewable Energy tweets from Europe:', eu_re),
    ('No. of Clean Energy tweets from Europe:', eu_ce),
    ('No. of Green Energy tweets from Europe:', eu_ge),
]
for caption, frame in tweet_counts:
    print(caption, len(frame))


No. of Renewable Energy tweets from the US: 3637
No. of Clean Energy tweets from the US: 5317
No. of Green Energy tweets from the US: 2138
No. of Renewable Energy tweets from Europe: 3689
No. of Clean Energy tweets from Europe: 1091
No. of Green Energy tweets from Europe: 2344


# Data Preparation

### Combining Tweet Data

In [4]:
# Tag every row with the name of its source DataFrame so the origin
# survives once the frames are combined
labelled_sources = [
    ('us_re', us_re),
    ('us_ce', us_ce),
    ('us_ge', us_ge),
    ('eu_re', eu_re),
    ('eu_ce', eu_ce),
    ('eu_ge', eu_ge),
]
for source_name, source_df in labelled_sources:
    source_df['label'] = source_name

eu_ce.head()

Unnamed: 0,date_time,content,hashtags,language,user_location,coord,place,label
0,2022-05-31 10:14:42+00:00,I joined Canada’s Minister of Environment and ...,"['Stockholm', 'ClimateAction', 'cleanenergy', ...",en,"Stockholm, Sweden","Coordinates(longitude=17.7601322, latitude=59....","Place(fullName='Stockholm, Sweden', name='Stoc...",eu_ce
1,2022-05-31 08:40:30+00:00,The Green Deal relies on the same false soluti...,['EUGreenWeek'],en,"Ghent, Belgium","Coordinates(longitude=4.3139889, latitude=50.7...","Place(fullName='Brussels, Belgium', name='Brus...",eu_ce
2,2022-05-30 14:26:35+00:00,"From aluminum to zinc, these are the building ...",,en,Gijon. Spain,"Coordinates(longitude=-5.8209772, latitude=43....","Place(fullName='Gijón, Spain', name='Gijón', t...",eu_ce
3,2022-05-30 10:23:10+00:00,This will not rock as major economy's that rel...,,en,"England, United Kingdom","Coordinates(longitude=-2.3798084, latitude=53....","Place(fullName='Sale, England', name='Sale', t...",eu_ce
4,2022-05-30 08:56:30+00:00,Absence of meaningful EU/G7/MDB offer on clean...,['COP27'],en,UK,"Coordinates(longitude=-6.36850399906372, latit...","Place(fullName='South West, England', name='So...",eu_ce


In [5]:
# Parse the timestamp column of every source frame, then stack all frames
# into a single DataFrame ordered by date and time with a fresh 0..n-1 index
sources = [us_re, us_ce, us_ge, eu_re, eu_ce, eu_ge]
for source in sources:
    source['date_time'] = pd.to_datetime(source['date_time'])

df = (
    pd.concat(sources, ignore_index=True)
    .sort_values(by='date_time')
    .reset_index(drop=True)
)
df


Unnamed: 0,date_time,content,hashtags,language,user_location,coord,place,label
0,2021-06-01 00:33:58+00:00,@sparki1969 I guess he’s never spoken to the N...,,en,"Houston, TX","Coordinates(longitude=-95.823268, latitude=29....","Place(fullName='Houston, TX', name='Houston', ...",us_ce
1,2021-06-01 00:34:00+00:00,WE NEED A MANHATTEN PROJECT TO CLEAN UP PLASTI...,,en,"Northern, California.","Coordinates(longitude=-122.452708, latitude=40...","Place(fullName='Redding, CA', name='Redding', ...",us_ce
2,2021-06-01 00:34:00+00:00,WE NEED A MANHATTEN PROJECT TO CLEAN UP PLASTI...,,en,"Northern, California.","Coordinates(longitude=-122.452708, latitude=40...","Place(fullName='Redding, CA', name='Redding', ...",us_ge
3,2021-06-01 00:58:15+00:00,Really?! We ordered the @Ford #MustangMachE on...,"['MustangMachE', 'ElectricVehicles', 'Tesla', ...",en,,"Coordinates(longitude=-75.7887564, latitude=38...","Place(fullName='Delaware, USA', name='Delaware...",us_ce
4,2021-06-01 01:56:14+00:00,TONIGHT: There are only a few hours left to pa...,"['climate', 'FossilFreeFuture', 'CleanEnergy',...",en,"Downers Grove, IL","Coordinates(longitude=-88.0966885, latitude=41...","Place(fullName='Downers Grove, IL', name='Down...",us_ce
...,...,...,...,...,...,...,...,...
18211,2022-05-31 23:07:40+00:00,The second area of strategic partnership highl...,['cleanenergy'],en,"West Lafayette, IN","Coordinates(longitude=-86.999431, latitude=40....","Place(fullName='West Lafayette, IN', name='Wes...",us_ce
18212,2022-05-31 23:26:52+00:00,Here at NV Energy's headquarters where @SenCor...,,en,"Las Vegas, NV","Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re
18213,2022-05-31 23:27:30+00:00,@GGMM_2020 @pros_and_hoes @ShelKel2 @CourtneyM...,,en,"Valley Forge, Pa.","Coordinates(longitude=-75.471249, latitude=40....","Place(fullName='Schwenksville, PA', name='Schw...",us_re
18214,2022-05-31 23:33:57+00:00,@DebHaalandNM announces two new renewable ener...,,en,"Las Vegas, NV","Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re


In [6]:
# Count rows whose tweet text already appeared in an earlier row
df.duplicated(subset='content').sum()

960

In [7]:
# Flag every repeat of an already-seen tweet text, keep only the first
# occurrence of each, and renumber the remaining rows from 0
is_repeat = df.duplicated(subset='content', keep='first')
df = df.loc[~is_repeat].reset_index(drop=True)
df

Unnamed: 0,date_time,content,hashtags,language,user_location,coord,place,label
0,2021-06-01 00:33:58+00:00,@sparki1969 I guess he’s never spoken to the N...,,en,"Houston, TX","Coordinates(longitude=-95.823268, latitude=29....","Place(fullName='Houston, TX', name='Houston', ...",us_ce
1,2021-06-01 00:34:00+00:00,WE NEED A MANHATTEN PROJECT TO CLEAN UP PLASTI...,,en,"Northern, California.","Coordinates(longitude=-122.452708, latitude=40...","Place(fullName='Redding, CA', name='Redding', ...",us_ce
2,2021-06-01 00:58:15+00:00,Really?! We ordered the @Ford #MustangMachE on...,"['MustangMachE', 'ElectricVehicles', 'Tesla', ...",en,,"Coordinates(longitude=-75.7887564, latitude=38...","Place(fullName='Delaware, USA', name='Delaware...",us_ce
3,2021-06-01 01:56:14+00:00,TONIGHT: There are only a few hours left to pa...,"['climate', 'FossilFreeFuture', 'CleanEnergy',...",en,"Downers Grove, IL","Coordinates(longitude=-88.0966885, latitude=41...","Place(fullName='Downers Grove, IL', name='Down...",us_ce
4,2021-06-01 02:57:52+00:00,@Fiorella_im Water is infinite. Energy to acce...,,en,,"Coordinates(longitude=-118.017789, latitude=33...","Place(fullName='Anaheim, CA', name='Anaheim', ...",us_re
...,...,...,...,...,...,...,...,...
17251,2022-05-31 23:07:40+00:00,The second area of strategic partnership highl...,['cleanenergy'],en,"West Lafayette, IN","Coordinates(longitude=-86.999431, latitude=40....","Place(fullName='West Lafayette, IN', name='Wes...",us_re
17252,2022-05-31 23:26:52+00:00,Here at NV Energy's headquarters where @SenCor...,,en,"Las Vegas, NV","Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re
17253,2022-05-31 23:27:30+00:00,@GGMM_2020 @pros_and_hoes @ShelKel2 @CourtneyM...,,en,"Valley Forge, Pa.","Coordinates(longitude=-75.471249, latitude=40....","Place(fullName='Schwenksville, PA', name='Schw...",us_re
17254,2022-05-31 23:33:57+00:00,@DebHaalandNM announces two new renewable ener...,,en,"Las Vegas, NV","Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re


In [8]:
# Tweets remaining per source label once duplicates are gone
df.loc[:, 'label'].value_counts()

us_ce    5093
eu_re    3524
us_re    3439
eu_ge    2182
us_ge    2027
eu_ce     991
Name: label, dtype: int64

### Cleaning Tweet Text

In [9]:
## Function to clean tweets and remove unwanted information (e.g. URLs, mentions, etc.)
def clean_tweets(text):
    """Strip Twitter-specific noise from a tweet and normalise its whitespace.

    The steps run in this order:
      1. remove retweet prefixes of the form 'RT @user:'
      2. remove @mentions
      3. remove http/https URLs
      4. replace the HTML entity '&amp;' with the word 'and'
      5. collapse every run of whitespace (including newlines) to one space
         and trim both ends

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet; '' when the input is not a string.
    """
    # Missing values arrive as float NaN and would make re.sub raise TypeError
    if not isinstance(text, str):
        return ''

    # Raw strings avoid the invalid '\w' escape warning; the leading \b stops
    # the pattern from matching inside a longer word such as 'START @user:'
    text = re.sub(r'\bRT @\w*:', '', text)
    # Remove twitter handles from mentions ('@user')
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove URLs up to the next whitespace, so characters such as '-', '_',
    # '?', '=' and '%' inside a link do not leave fragments behind
    text = re.sub(r'https?://\S+', '', text)
    # Change HTML ampersand formatting
    text = text.replace('&amp;', 'and')
    # split() with no argument splits on any whitespace run and drops empties
    return ' '.join(text.split())

In [10]:
# Show the raw, uncleaned text of the row at position 3
df['content'].iat[3]

'TONIGHT: There are only a few hours left to pass a comprehensive #climate bill that prioritizes equity, jobs, &amp; justice! @DonHarmonIL, @RepChrisWelch - nearly 50 legislators won’t vote for an energy bill without a #FossilFreeFuture. #CleanEnergy #twill https://t.co/Z0D6yooaTw'

In [11]:
# Show the raw, uncleaned text of the row at position 30
df['content'].iat[30]

'Switching to clean energy with Solar Tech Elec has never been easier. With $0 down finance available. There’s no better time to lock into a fixed rate lower than your current electricity bill.\n\n☎️(727) 488 8634\n\nhttps://t.co/vdlnyDY0N2\n\n#cleanenergy #solarpower #solartechelec https://t.co/q2LyA7UnR8'

In [12]:
# Store the cleaned version of every tweet in a new 'tweet' column
df['tweet'] = df['content'].apply(clean_tweets)

# Show the cleaned text of the row at position 3
df['tweet'].iat[3]

'TONIGHT: There are only a few hours left to pass a comprehensive #climate bill that prioritizes equity, jobs, and justice! , - nearly 50 legislators won’t vote for an energy bill without a #FossilFreeFuture. #CleanEnergy #twill'

In [13]:
# Show the cleaned text of the row at position 30
df['tweet'].iat[30]

'Switching to clean energy with Solar Tech Elec has never been easier. With $0 down finance available. There’s no better time to lock into a fixed rate lower than your current electricity bill. ☎️(727) 488 8634 #cleanenergy #solarpower #solartechelec'

In [14]:
# Drop the unwanted fields by selecting only the columns to keep, in this order
kept_columns = ['date_time', 'coord', 'place', 'label', 'hashtags', 'tweet']
df = df.loc[:, kept_columns]
df

Unnamed: 0,date_time,coord,place,label,hashtags,tweet
0,2021-06-01 00:33:58+00:00,"Coordinates(longitude=-95.823268, latitude=29....","Place(fullName='Houston, TX', name='Houston', ...",us_ce,,"I guess he’s never spoken to the Navajo, Shosh..."
1,2021-06-01 00:34:00+00:00,"Coordinates(longitude=-122.452708, latitude=40...","Place(fullName='Redding, CA', name='Redding', ...",us_ce,,WE NEED A MANHATTEN PROJECT TO CLEAN UP PLASTI...
2,2021-06-01 00:58:15+00:00,"Coordinates(longitude=-75.7887564, latitude=38...","Place(fullName='Delaware, USA', name='Delaware...",us_ce,"['MustangMachE', 'ElectricVehicles', 'Tesla', ...",Really?! We ordered the #MustangMachE on April...
3,2021-06-01 01:56:14+00:00,"Coordinates(longitude=-88.0966885, latitude=41...","Place(fullName='Downers Grove, IL', name='Down...",us_ce,"['climate', 'FossilFreeFuture', 'CleanEnergy',...",TONIGHT: There are only a few hours left to pa...
4,2021-06-01 02:57:52+00:00,"Coordinates(longitude=-118.017789, latitude=33...","Place(fullName='Anaheim, CA', name='Anaheim', ...",us_re,,Water is infinite. Energy to access it can be ...
...,...,...,...,...,...,...
17251,2022-05-31 23:07:40+00:00,"Coordinates(longitude=-86.999431, latitude=40....","Place(fullName='West Lafayette, IN', name='Wes...",us_re,['cleanenergy'],The second area of strategic partnership highl...
17252,2022-05-31 23:26:52+00:00,"Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re,,Here at NV Energy's headquarters where is spea...
17253,2022-05-31 23:27:30+00:00,"Coordinates(longitude=-75.471249, latitude=40....","Place(fullName='Schwenksville, PA', name='Schw...",us_re,,You are so right. To truly be energy independe...
17254,2022-05-31 23:33:57+00:00,"Coordinates(longitude=-115.384091, latitude=36...","Place(fullName='Las Vegas, NV', name='Las Vega...",us_re,,announces two new renewable energy initiatives...


In [15]:
# Write the cleaned, combined tweets to disk without the row index column
output_path = 'cleaned_tweets_combined.csv'
df.to_csv(output_path, index=False)