In [1]:
import os
import shutil

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Show full cell contents in pandas output instead of truncating long strings.
pd.options.display.max_colwidth = None

In [3]:
from google.cloud import storage

In [4]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Reuse the already-running Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# With eager evaluation on, a DataFrame left as the last expression of a cell
# is evaluated and rendered as a table in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [6]:
# Read the parquet dataset at the path below into a Spark DataFrame.
TWEETS_PARQUET_PATH = 'gs://msca-bdp-students-bucket/shared_data/kishorkumarreddy/filtered_data_for_analysis_1'
twitter_df = spark.read.parquet(TWEETS_PARQUET_PATH)
# Second name bound to the same DataFrame object (no copy is made).
c = twitter_df

23/03/01 18:43:33 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [7]:
# Expose the loaded DataFrame to Spark SQL under the view name "twitter_df".
twitter_df.createOrReplaceTempView("twitter_df")

In [8]:
# Flatten the nested tweet records into a narrow projection: the retweet flag,
# the id and screen name taken from retweeted_status, and the posting user's
# screen name, verified flag, follower count and lifetime status count.
query = '''
select retweeted as is_retweeted,
retweeted_status.id as rt_original_id,
retweeted_status.user.screen_name as rt_original_user,
user.screen_name as handle_name,
user.verified as is_verified,
user.followers_count as followers,
user.statuses_count as total_tweets
from twitter_df'''
# Run the projection against whatever is currently registered as "twitter_df".
tw_sq1 = spark.sql(query)

In [9]:
# A row is labelled "retweet" when it carries an rt_original_id, "tweet" otherwise;
# only the rows labelled "tweet" are kept.
is_retweet = col("rt_original_id").isNotNull()
twitter_df1 = (
    tw_sq1
    .withColumn("tweet_type", when(is_retweet, "retweet").otherwise("tweet"))
    .filter(col("tweet_type") == "tweet")
)

In [10]:
# Re-point the SQL view name "twitter_df" at twitter_df1. From here on, SQL
# queries against "twitter_df" see this projection, while the Python variable
# `twitter_df` still refers to the DataFrame loaded from parquet.
twitter_df1.createOrReplaceTempView("twitter_df")

In [11]:
# Print the column names, types and nullability of the flattened projection.
twitter_df1.printSchema()

root
 |-- is_retweeted: string (nullable = true)
 |-- rt_original_id: long (nullable = true)
 |-- rt_original_user: string (nullable = true)
 |-- handle_name: string (nullable = true)
 |-- is_verified: boolean (nullable = true)
 |-- followers: long (nullable = true)
 |-- total_tweets: long (nullable = true)
 |-- tweet_type: string (nullable = false)



# Based on Overall Content (Tweets), the Most Prolific Tweeters

In [39]:
# Top 10 handles ranked by the largest total_tweets value seen for each handle
# in the current "twitter_df" view.
top_total_content_sql = 'SELECT handle_name as user, max(total_tweets) as total_content FROM twitter_df GROUP BY user ORDER BY total_content DESC LIMIT 10'
top_total_content = spark.sql(top_total_content_sql)
top_total_content.toPandas()

                                                                                

Unnamed: 0,user,total_content
0,soldier_777,4327697
1,zazoomblog,4197074
2,PulpNews,4187315
3,missb62,3137543
4,sectest9,3080006
5,marekingu,2897601
6,Knewz_Currently,2850254
7,jornalistavitor,2815923
8,filafresh,2736894
9,paul_cude,2662698


# Based on Original Content (Tweets), the Most Prolific Tweeters

In [13]:
# Top 10 handles ranked by how many rows labelled "tweet" they have in the
# current "twitter_df" view.
top_original_content_sql = 'SELECT handle_name as user, count(*) as original_content FROM twitter_df WHERE tweet_type = "tweet" GROUP BY user ORDER BY original_content DESC LIMIT 10'
top_original_content = spark.sql(top_original_content_sql)
top_original_content.toPandas()

                                                                                

Unnamed: 0,user,original_content
0,ParentSecurity,3128
1,AJBlackston,1323
2,PulpNews,1290
3,getthatrightgtr,1147
4,AirLiveRadio,1038
5,Bgm117771,1002
6,MarchingTruth2,873
7,oodlu_tweets,858
8,itsrohitchouhan,857
9,DuoInspirations,837


In [14]:
# Rebuild twitter_df1 from the flattened projection `tw_sq1`, this time keeping
# every row. (The original cell referenced `tw_Q1`, a name that is never defined
# in this notebook and raises NameError on a fresh kernel.)
# Rows without an rt_original_id are labelled "tweet"; because there is no
# otherwise() branch, all remaining rows get a NULL tweet_type.
twitter_df1 = tw_sq1.withColumn(
    "tweet_type",
    when(col("rt_original_id").isNull(), "tweet"),
)
# Re-point the SQL view "twitter_df" at the rebuilt frame.
twitter_df1.createOrReplaceTempView("twitter_df")
# Row count of the rebuilt frame, displayed as the cell output.
twitter_df1.count()

                                                                                

1005167

Identifying the Most Prolific Authors

In [15]:
# Count rows labelled "tweet" per handle in the "twitter_df" view and keep the
# ten handles with the most. The count column is left unaliased, so it shows up
# under Spark's default name for count(*).
query = '''select handle_name, 
count(*) 
from twitter_df 
where tweet_type = "tweet" 
group by handle_name 
order by count(*) 
desc limit 10'''
# Collect the ten result rows to pandas for display.
spark.sql(query).toPandas()

                                                                                

Unnamed: 0,handle_name,count(1)
0,ParentSecurity,3128
1,AJBlackston,1323
2,PulpNews,1290
3,getthatrightgtr,1147
4,AirLiveRadio,1038
5,Bgm117771,1002
6,MarchingTruth2,873
7,oodlu_tweets,858
8,itsrohitchouhan,857
9,DuoInspirations,837


# Based on Retweet Count (Most Retweeted Users)

In [16]:
# Number of rows per retweeted user, most-retweeted first. There is no LIMIT,
# so every distinct rt_original_user is collected to the driver.
most_retweeted_sql = 'select rt_original_user, count(*) from twitter_df where rt_original_user IS NOT NULL group by rt_original_user order by count(*) desc'
most_retweeted = spark.sql(most_retweeted_sql)
most_retweeted.toPandas()

                                                                                

Unnamed: 0,rt_original_user,count(1)
0,Golshifteh,7451
1,GretaThunberg,4504
2,PahlaviReza,3786
3,Hornystepsis1,3638
4,robbeorn,3277
...,...,...
80269,baddiejenny92,1
80270,rouse_dr,1
80271,CabellsMedicine,1
80272,mattitakubel,1


In [17]:
# Summary statistics (count, mean, stddev, min, max) for the DataFrame loaded
# from parquet. This is the Python variable `twitter_df`, not the SQL view of
# the same name.
twitter_df.describe()

                                                                                

summary,created_at,favorite_count,filter_level,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,lang,quote_count,quoted_status_id,quoted_status_id_str,quoted_text,reply_count,retweet_count,retweeted,retweeted_from,source,text,timestamp_ms,tweet_text,stripped
count,1005167,1005167.0,1005167,1005167,1005167,33709,27232,27232,33709,33709,1005167,1005167.0,72963,72963,72946,1005167.0,1005167.0,1005167,703935,1005167,1005167,1005167,1005167,1005167
mean,,0.0,,1.566841370743179...,1.566841370743179...,433.0,1.563454478121966...,1.563454478121966...,5.277987172089125...,5.277987172089125...,,0.0,1.559602036301624...,1.559602036301624...,420.0,0.0,0.0,,15417.625925925926,,,1.662399067946775...,,
stddev,,0.0,,3.115244689557993...,3.115244689557993...,,4.269803723556284...,4.269803723556284...,6.307440523906169...,6.307440523906169...,,0.0,5.343131196135945...,5.343131196135945...,,0.0,0.0,,543175.3414855197,,,7.427322124370399E9,,
min,Fri Apr 08 00:00:...,0.0,low,1511197436458459137,1511197436458459137,00Fuzz,1559129819,1018119192426491904,2651,1000060610518896640,en,0.0,124623659041161217,1003686491070128129,!!! Parents Alert...,0.0,0.0,,https,"<a href=""HTTP://b...",! #students #scho...,1649132520199,! #students #scho...,after a lon...
max,Wed Sep 28 23:59:...,0.0,low,1623307508390694913,1623307508390694913,zzzzzryu,1623288183676096512,993241772317913088,1622359362378375171,999988014032945152,en,0.0,1623294581193203714,999010989751259136,🫶~The ❤️ back to...,0.0,0.0,RT,… https,"<a href=""https://...",🫶🏽 Give back to...,1675861644252,🫶🏽 give back to...,🫶🏽 give back to...


In [18]:
# Retweet/reply counters found at each nesting level of a tweet record, plus the
# tweet text, so the different counters can be compared side by side.
retweet_count_columns = [
    twitter_df.retweet_count.alias("direct_retweet_count"),
    twitter_df.quoted_status.retweet_count.alias("quoted_status.retweet_count"),
    twitter_df.retweeted_status.quoted_status.retweet_count.alias("retweeted_status.quoted_status.retweet_count"),
    twitter_df.retweeted_status.retweet_count.alias("retweeted_status.retweet_count"),
    twitter_df.retweeted_status.reply_count.alias("retweeted_status.reply_count"),
    twitter_df.reply_count.alias("reply_count"),
    twitter_df.tweet_text,
]
# Only this many rows are pulled into pandas.
RETWEET_SAMPLE_ROWS = 10000
retweet_counts_sample = twitter_df.select(retweet_count_columns).limit(RETWEET_SAMPLE_ROWS)
df_retweetCounts = retweet_counts_sample.toPandas()

                                                                                

In [19]:
# pandas count(): number of non-null values in each column of the sample.
df_retweetCounts.count()

direct_retweet_count                            10000
quoted_status.retweet_count                       535
retweeted_status.quoted_status.retweet_count      329
retweeted_status.retweet_count                   7935
retweeted_status.reply_count                     7935
reply_count                                     10000
tweet_text                                      10000
dtype: int64

In [20]:
# pandas descriptive statistics for the columns of the sample.
df_retweetCounts.describe()

Unnamed: 0,direct_retweet_count,quoted_status.retweet_count,retweeted_status.quoted_status.retweet_count,retweeted_status.retweet_count,retweeted_status.reply_count,reply_count
count,10000.0,535.0,329.0,7935.0,7935.0,10000.0
mean,0.0,1073.06729,907.468085,1370.853938,232.817139,0.0
std,0.0,2663.100875,2459.589249,2528.899777,425.764715,0.0
min,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,10.0,21.0,19.0,0.0,0.0
50%,0.0,106.0,109.0,184.0,13.0,0.0
75%,0.0,969.5,969.0,1390.5,232.0,0.0
max,0.0,22407.0,20944.0,14588.0,2293.0,0.0


In [21]:
# Keep the rows whose nested retweeted_status carries a retweet_count, then count them.
# NOTE(review): rows with a retweeted_status look like retweets rather than
# original tweets, so the variable name may be misleading — confirm intent.
df_original_tweets = twitter_df.where("retweeted_status.retweet_count is not null")
df_original_tweets.count()

                                                                                

652559

In [22]:
# count1: every row in the loaded DataFrame.
# count2: rows that have no retweeted_status.
count1 = twitter_df.count()
count2 = twitter_df.where('retweeted_status is null').count()
for label, value in (
    ('Count of all tweets:', count1),
    ('Count of all original tweets:', count2),
):
    print(label, value)



Count of all tweets: 1005167
Count of all original tweets: 352608


                                                                                

## Identifying the Most Prolific (or) Influential Twitterers Using the Count of Tweets

In [23]:
# List the top-level column names of the DataFrame loaded from parquet.
print(twitter_df.columns)

['coordinates', 'created_at', 'display_text_range', 'entities', 'extended_entities', 'extended_tweet', 'favorite_count', 'favorited', 'filter_level', 'geo', 'id', 'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place', 'possibly_sensitive', 'quote_count', 'quoted_status', 'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink', 'quoted_text', 'reply_count', 'retweet_count', 'retweeted', 'retweeted_from', 'retweeted_status', 'source', 'text', 'timestamp_ms', 'truncated', 'tweet_text', 'user', 'withheld_in_countries', 'stripped']


### Extracting Important Columns and fields from the filtered dataframe

In [40]:
# Disabled schema dump. NOTE(review): `tw_df` is not defined in any visible
# cell, so this line would need a different variable before being re-enabled.
#tw_df.printSchema()

In [27]:
# Every location-related field of a tweet record (point coordinates, place
# details, the free-text user location) together with the tweet text and the
# user's withheld_in_countries list.
location_columns = [
    twitter_df.coordinates.alias("direct_coordinates"),
    twitter_df.geo.coordinates.alias("geo.coordinates"),
    twitter_df.place.bounding_box.coordinates.alias("place.bounding_box.coordinates"),
    twitter_df.place.country_code.alias("place.country_code"),
    twitter_df.place.country.alias("place.country"),
    twitter_df.place['name'].alias("place.name"),
    twitter_df.place.full_name.alias("place.full_name"),
    twitter_df.user.location.alias("user.location"),
    twitter_df.tweet_text,
    twitter_df.user.withheld_in_countries,
]
# Only this many rows are pulled into pandas.
COORDS_SAMPLE_ROWS = 30000
coords_sample = twitter_df.select(location_columns).limit(COORDS_SAMPLE_ROWS)
df_coords = coords_sample.toPandas()

                                                                                

In [28]:
# pandas descriptive statistics for the location sample.
df_coords.describe()

Unnamed: 0,direct_coordinates,geo.coordinates,place.bounding_box.coordinates,place.country_code,place.country,place.name,place.full_name,user.location,tweet_text,user.withheld_in_countries
count,72,72,193,193,193,193,193,12523,30000,30000
unique,60,60,161,15,17,162,162,5560,14065,1
top,"([-71.4128343, 41.8239891], Point)","[41.8239891, -71.4128343]","[[[-71.474186, 41.772455], [-71.474186, 41.861713], [-71.369479, 41.861713], [-71.369479, 41.772455]]]",US,United States,Providence,"Providence, RI",United States,let go of students in #sharif_university #mahsaamini https://t.co/jjnszzjjf5,[]
freq,5,5,5,124,123,5,5,225,2563,30000


### Counting the rows with a non-null user.withheld_in_countries field (to avoid bad plotting later)

In [29]:
# Number of rows in which user.withheld_in_countries is present.
has_withheld_info = twitter_df.where("user.withheld_in_countries is not NULL")
has_withheld_info.count()

                                                                                

1005167

In [31]:
# Column names as a pandas Index. Built directly from the Spark schema instead
# of collecting five deeply nested rows with toPandas() just to read the names;
# the resulting Index holds the same labels in the same order.
pd.Index(twitter_df.columns)

                                                                                

Index(['coordinates', 'created_at', 'display_text_range', 'entities',
       'extended_entities', 'extended_tweet', 'favorite_count', 'favorited',
       'filter_level', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
       'lang', 'place', 'possibly_sensitive', 'quote_count', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink',
       'quoted_text', 'reply_count', 'retweet_count', 'retweeted',
       'retweeted_from', 'retweeted_status', 'source', 'text', 'timestamp_ms',
       'truncated', 'tweet_text', 'user', 'withheld_in_countries', 'stripped'],
      dtype='object')

In [32]:
# Per-tweet frame for the location/influence analysis: timestamp and id,
# geo coordinates, the posting user's profile fields, the reply and retweet
# counters taken from the nested retweeted_status, the full retweeted_status
# struct, and both text columns.
# (The original cell had stray characters after the closing "])", which made
# it a SyntaxError.)
df_coords_analysis_df = twitter_df.select([
    twitter_df.created_at,
    twitter_df.id,
    twitter_df.geo.coordinates.alias("geo_coordinates"),
    twitter_df.user['name'].alias("user_name"),
    twitter_df.user.followers_count.alias("followers_count"),
    twitter_df.user.verified.alias("verified_user"),
    twitter_df.user.location.alias("user_location"),
    twitter_df.user.description.alias("user_description"),
    # These two counters come from retweeted_status, not from the top-level fields.
    twitter_df.retweeted_status.reply_count.alias("reply_count"),
    twitter_df.retweeted_status.retweet_count.alias("retweet_count"),
    # The whole nested struct is kept, so displayed rows are very wide.
    twitter_df.retweeted_status.alias("retweeted_status"),
    twitter_df.tweet_text,
    twitter_df.text,
])

# Preview the first five rows in pandas.
df_coords_analysis_df.limit(5).toPandas()

23/03/01 18:57:54 ERROR org.apache.spark.network.client.TransportResponseHandler: Still have 2 requests outstanding when connection from /10.128.0.68:40754 is closed
23/03/01 18:57:54 WARN org.apache.spark.storage.BlockManagerMasterEndpoint: Error trying to remove broadcast 36 from block manager BlockManagerId(32, hub-msca-bdp-dphub-students-kishorkumarreddy-sw-qdhr.c.msca-bdp-students.internal, 40227, None)
java.io.IOException: Connection from /10.128.0.68:40754 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:146)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(Abstra

Unnamed: 0,created_at,id,geo_coordinates,user_name,followers_count,verified_user,user_location,user_description,reply_count,retweet_count,retweeted_status,tweet_text,text
0,Sat Oct 29 18:36:18 +0000 2022,1586426715760459778,,TitanUp Ty,2256,False,,#LGM #TITANS #GBO🍊 #FJB #NJ #USA 🇺🇸,5,51,"(None, Sat Oct 29 13:22:46 +0000 2022, [0, 82], ([Row(indices=[49, 53], text='SEC'), Row(indices=[54, 70], text='CollegeFootball'), Row(indices=[71, 75], text='cfb'), Row(indices=[76, 82], text='NCAAF')], [Row(additional_media_info=None, description=None, display_url='pic.twitter.com/ra3EkYetqu', expanded_url='https://twitter.com/JWPSports/status/1586347814967132163/photo/1', id=1586347810932154368, id_str='1586347810932154368', indices=[83, 106], media_url='http://pbs.twimg.com/media/FgPW7eDUYAAPV17.jpg', media_url_https='https://pbs.twimg.com/media/FgPW7eDUYAAPV17.jpg', sizes=Row(large=Row(h=1159, resize='fit', w=1172), medium=Row(h=1159, resize='fit', w=1172), small=Row(h=672, resize='fit', w=680), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/ra3EkYetqu')], [], [], []), ([Row(additional_media_info=None, description=None, display_url='pic.twitter.com/ra3EkYetqu', expanded_url='https://twitter.com/JWPSports/status/1586347814967132163/photo/1', id=1586347810932154368, id_str='1586347810932154368', indices=[83, 106], media_url='http://pbs.twimg.com/media/FgPW7eDUYAAPV17.jpg', media_url_https='https://pbs.twimg.com/media/FgPW7eDUYAAPV17.jpg', sizes=Row(large=Row(h=1159, resize='fit', w=1172), medium=Row(h=1159, resize='fit', w=1172), small=Row(h=672, resize='fit', w=680), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='photo', url='https://t.co/ra3EkYetqu', video_info=None)],), None, 402, False, low, None, 1586347814967132163, 1586347814967132163, None, None, None, None, None, False, en, None, False, 3, None, None, None, None, 5, 51, False, None, <a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>, 
Your current SEC College Football QBR leaders.👀\n\n#SEC #CollegeFootball #cfb #NCAAF https://t.co/ra3EkYetqu, False, (False, Fri Jan 31 15:24:06 +0000 2020, True, False, JWP Sports! Be sure to find me on Instagram @jwp_sports | @cfbalerts only on Instagram. Connect with me on all platforms ⬇️, 1935, 4414, 1857, True, 1223265683565416448, 1223265683565416448, False, 6, None, JWP Sports , F5F8FA, , , False, None, http://pbs.twimg.com/profile_images/1225762639957106688/k75frUmv_normal.jpg, https://pbs.twimg.com/profile_images/1225762639957106688/k75frUmv_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, JWPSports, 1623, none, https://linktr.ee/jwpsports, False, None, []), None)",your current sec college football qbr leaders.👀\n\n#sec #collegefootball #cfb #ncaaf https://t.co/ra3ekyetqu,RT @JWPSports: Your current SEC College Football QBR leaders.👀\n\n#SEC #CollegeFootball #cfb #NCAAF https://t.co/ra3EkYetqu
1,Sat Oct 29 18:36:18 +0000 2022,1586426719090589696,,SickYooo,88,False,,..,3,168,"(None, Sun Feb 13 19:58:23 +0000 2022, [0, 140], ([Row(indices=[52, 66], text='collegebaddie'), Row(indices=[67, 75], text='college'), Row(indices=[76, 87], text='latinahead'), Row(indices=[88, 102], text='latinaexposed')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1492951306398355456', indices=[104, 127], url='https://t.co/tLxG8VA9ih')], []), None, ([0, 155], ([Row(indices=[52, 66], text='collegebaddie'), Row(indices=[67, 75], text='college'), Row(indices=[76, 87], text='latinahead'), Row(indices=[88, 102], text='latinaexposed'), Row(indices=[103, 116], text='LatinaBeauty'), Row(indices=[117, 130], text='sloppythroat'), Row(indices=[131, 147], text='sloppyheadGIVER'), Row(indices=[148, 155], text='blowie')], [Row(additional_media_info=Row(description=None, embeddable=None, monetizable=False, title=None), description=None, display_url='pic.twitter.com/GhG6abUn99', expanded_url='https://twitter.com/pxrnstachee/status/1492951306398355456/video/1', id=1492951256318431232, id_str='1492951256318431232', indices=[156, 179], media_url='http://pbs.twimg.com/ext_tw_video_thumb/1492951256318431232/pu/img/PeBtNX6QBgdDQZHX.jpg', media_url_https='https://pbs.twimg.com/ext_tw_video_thumb/1492951256318431232/pu/img/PeBtNX6QBgdDQZHX.jpg', sizes=Row(large=Row(h=1280, resize='fit', w=732), medium=Row(h=1200, resize='fit', w=686), small=Row(h=680, resize='fit', w=389), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='video', url='https://t.co/GhG6abUn99', video_info=Row(aspect_ratio=[183, 320], duration_millis=45001, variants=[Row(bitrate=632000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/320x558/hDDxsZ1Kgt_l7sg3.mp4?tag=12'), Row(bitrate=None, content_type='application/x-mpegURL', 
url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/pl/np_2fWZzuV1YYGWs.m3u8?tag=12&container=fmp4'), Row(bitrate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/720x1258/OSWvqvJs35tJrRRC.mp4?tag=12'), Row(bitrate=950000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/480x838/ZqGt4Qzk4jREnItM.mp4?tag=12')]))], [], [], []), ([Row(additional_media_info=Row(description=None, embeddable=None, monetizable=False, title=None), description=None, display_url='pic.twitter.com/GhG6abUn99', expanded_url='https://twitter.com/pxrnstachee/status/1492951306398355456/video/1', id=1492951256318431232, id_str='1492951256318431232', indices=[156, 179], media_url='http://pbs.twimg.com/ext_tw_video_thumb/1492951256318431232/pu/img/PeBtNX6QBgdDQZHX.jpg', media_url_https='https://pbs.twimg.com/ext_tw_video_thumb/1492951256318431232/pu/img/PeBtNX6QBgdDQZHX.jpg', sizes=Row(large=Row(h=1280, resize='fit', w=732), medium=Row(h=1200, resize='fit', w=686), small=Row(h=680, resize='fit', w=389), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='video', url='https://t.co/GhG6abUn99', video_info=Row(aspect_ratio=[183, 320], duration_millis=45001, variants=[Row(bitrate=632000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/320x558/hDDxsZ1Kgt_l7sg3.mp4?tag=12'), Row(bitrate=None, content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/pl/np_2fWZzuV1YYGWs.m3u8?tag=12&container=fmp4'), Row(bitrate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/720x1258/OSWvqvJs35tJrRRC.mp4?tag=12'), Row(bitrate=950000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1492951256318431232/pu/vid/480x838/ZqGt4Qzk4jREnItM.mp4?tag=12')]))],), Latina 
baddie wanted to suck me off during a party😫 #collegebaddie #college #latinahead #latinaexposed #LatinaBeauty #sloppythroat #sloppyheadGIVER #blowie https://t.co/GhG6abUn99), 1275, False, low, None, 1492951306398355456, 1492951306398355456, None, None, None, None, None, False, en, None, True, 0, None, None, None, None, 3, 168, False, None, <a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>, Latina baddie wanted to suck me off during a party😫 #collegebaddie #college #latinahead #latinaexposed… https://t.co/tLxG8VA9ih, True, (False, Thu May 10 01:13:52 +0000 2018, True, False, | 18+ Daily Content | 🌹No Content Is Owned By The Maker Of This Account🌹| Dm for credit/removal no copyright infringement intended, 8017, 64513, 45, False, 994384995740352512, 994384995740352512, False, 216, None, ParadisePxrn (dm for cheap promo), F5F8FA, , , False, https://pbs.twimg.com/profile_banners/994384995740352512/1666345959, http://pbs.twimg.com/profile_images/1583363185045667840/IYjjH-zG_normal.jpg, https://pbs.twimg.com/profile_images/1583363185045667840/IYjjH-zG_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, pxrnstachee, 565, none, https://discord.gg/sullen, False, None, []), None)",latina baddie wanted to suck me off during a party😫 #collegebaddie #college #latinahead #latinaexposed #latinabeauty #sloppythroat #sloppyheadgiver #blowie https://t.co/ghg6abun99,RT @pxrnstachee: Latina baddie wanted to suck me off during a party😫 #collegebaddie #college #latinahead #latinaexposed #LatinaBeauty #slop…
2,Sat Oct 29 18:36:55 +0000 2022,1586426871524343814,,Saturn,883,False,America,#براندازم #نه_به_جمهوری_اسلامی,0,7,"(None, Sat Oct 29 18:23:54 +0000 2022, None, ([Row(indices=[37, 56], text='UniversityofTehran')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1586423598645841922', indices=[117, 140], url='https://t.co/J61gZ0Gy9z')], []), None, ([0, 279], ([Row(indices=[37, 56], text='UniversityofTehran'), Row(indices=[267, 279], text='Mahsa_Amini')], None, [], [], []), None, Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclothes security forces (who had illegally entered the campus). Security forces waiting outside arresting and taking students away to undisclosed location #Mahsa_Amini), 11, False, low, None, 1586423598645841922, 1586423598645841922, None, None, None, None, None, True, en, None, None, 0, (None, Sat Oct 29 18:00:59 +0000 2022, None, ([Row(indices=[11, 23], text='دانشکده_فنی')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1586417830228267009', indices=[117, 140], url='https://t.co/Fw2dvhchXt')], []), None, ([0, 279], Row(hashtags=[Row(indices=[11, 23], text='دانشکده_فنی'), Row(indices=[268, 279], text='مهسا_امینی')], media=None, symbols=[], urls=[], user_mentions=[]), None, ۱/امروز در #دانشکده_فنی دانشگاه تهران، دانشجویان با تجمع مسالمت‌آمیز، شعار دادند و خواسته‌های خود را مطرح کردند اما مورد حمله نیروهای بسیج دانشجویی قرار گرفتند. 
نیروهای لباس شخصی به طور غیرقانونی از صبح در دانشگاه مستقر شده بودند و با بسیجی‌ها در این حمله همراه شدند.\n#مهسا_امینی), 63, False, low, None, 1586417830228267009, 1586417830228267009, None, None, None, None, None, False, fa, None, None, 1, None, None, 1, 25, False, None, <a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>, ۱/امروز در #دانشکده_فنی دانشگاه تهران، دانشجویان با تجمع مسالمت‌آمیز، شعار دادند و خواسته‌های خود را مطرح کردند اما… https://t.co/Fw2dvhchXt, True, (False, Tue Aug 11 07:17:35 +0000 2015, False, False, Official account of Iran Human Rights (IHR NGO)- Also visit @iranhr حساب رسمی توییتر سازمان حقوق بشر ایران, 1067, 9849, 284, True, 3414505683, 3414505683, False, 120, Oslo, Norway, Iran Human Rights (IHR NGO), 000000, http://abs.twimg.com/images/themes/theme1/bg.png, https://abs.twimg.com/images/themes/theme1/bg.png, False, https://pbs.twimg.com/profile_banners/3414505683/1665403153, http://pbs.twimg.com/profile_images/631002652361424896/NWkvQXd-_normal.png, https://pbs.twimg.com/profile_images/631002652361424896/NWkvQXd-_normal.png, 89C9FA, 000000, 000000, 000000, False, False, IHRights, 8255, none, https://iranhr.net/, True, None, []), None), 1586417830228267009, 1586417830228267009, (twitter.com/ihrights/statu…, https://twitter.com/ihrights/status/1586417830228267009, https://t.co/5MHIgxrjHW), 0, 7, False, None, <a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>, Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclo… https://t.co/J61gZ0Gy9z, True, (False, Sat May 02 13:32:40 +0000 2009, True, False, Director of NGO «Iran Human Rights"" @IHRights Professor at the University of Oslo- مدیر سازمان حقوق بشر ایران- پزشک و استاددانشکده پزشکی دانشگاه اسلو, 9660, 5589, 1642, True, 37196329, 37196329, False, 173, Global, Mahmood Amiry-Moghaddam, C0DEED, http://abs.twimg.com/images/themes/theme1/bg.png, 
https://abs.twimg.com/images/themes/theme1/bg.png, False, https://pbs.twimg.com/profile_banners/37196329/1425896261, http://pbs.twimg.com/profile_images/1306244699716489216/BfQaxpEW_normal.jpg, https://pbs.twimg.com/profile_images/1306244699716489216/BfQaxpEW_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, iranhr, 11461, none, http://www.iranhr.net, True, None, []), None)","today’s peaceful student protests of #universityoftehran, college of engineering was attacked by basij and plainclothes security forces (who had illegally entered the campus). security forces waiting outside arresting and taking students away to undisclosed location #mahsa_amini","RT @iranhr: Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclothes securit…"
3,Sat Oct 29 18:38:00 +0000 2022,1586427143302443008,,💙KOHAKU225💙,127,False,"Baton Rouge, LA",,3,16,"(None, Fri Oct 28 23:51:52 +0000 2022, [0, 140], ([Row(indices=[45, 48], text='bj'), Row(indices=[49, 53], text='pyt'), Row(indices=[54, 61], text='baddie'), Row(indices=[62, 69], text='leaked'), Row(indices=[70, 76], text='oussy'), Row(indices=[77, 87], text='backshots'), Row(indices=[88, 103], text='sellingcontent'), Row(indices=[104, 115], text='schoolthot')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1586143745078353920', indices=[117, 140], url='https://t.co/Vdd4JmZ8zn')], []), None, ([0, 187], ([Row(indices=[45, 48], text='bj'), Row(indices=[49, 53], text='pyt'), Row(indices=[54, 61], text='baddie'), Row(indices=[62, 69], text='leaked'), Row(indices=[70, 76], text='oussy'), Row(indices=[77, 87], text='backshots'), Row(indices=[88, 103], text='sellingcontent'), Row(indices=[104, 115], text='schoolthot'), Row(indices=[116, 119], text='dm'), Row(indices=[120, 125], text='head'), Row(indices=[126, 137], text='throatgoat'), Row(indices=[138, 142], text='bbc'), Row(indices=[143, 150], text='sloppy'), Row(indices=[151, 157], text='leaks'), Row(indices=[158, 166], text='college'), Row(indices=[167, 172], text='nsfw'), Row(indices=[173, 179], text='ebony'), Row(indices=[180, 187], text='pyteen')], [Row(additional_media_info=Row(description=None, embeddable=None, monetizable=False, title=None), description=None, display_url='pic.twitter.com/XKFQqhgNTN', expanded_url='https://twitter.com/freaky_central2/status/1586143745078353920/video/1', id=1586143673661677569, id_str='1586143673661677569', indices=[188, 211], media_url='http://pbs.twimg.com/ext_tw_video_thumb/1586143673661677569/pu/img/5hJNsSHYfpGRBKXF.jpg', media_url_https='https://pbs.twimg.com/ext_tw_video_thumb/1586143673661677569/pu/img/5hJNsSHYfpGRBKXF.jpg', sizes=Row(large=Row(h=1280, resize='fit', w=592), medium=Row(h=1200, resize='fit', w=555), 
small=Row(h=680, resize='fit', w=315), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='video', url='https://t.co/XKFQqhgNTN', video_info=Row(aspect_ratio=[37, 80], duration_millis=27627, variants=[Row(bitrate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/592x1280/gjcYcZKo5e-ZYGGa.mp4?tag=12'), Row(bitrate=None, content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/pl/LVnxWoY7O9mLNE5G.m3u8?tag=12&container=fmp4'), Row(bitrate=632000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/320x690/8hHnDD2IseH3kEhM.mp4?tag=12'), Row(bitrate=950000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/480x1036/RLYUVMasvBGnSSvI.mp4?tag=12')]))], [], [], []), ([Row(additional_media_info=Row(description=None, embeddable=None, monetizable=False, title=None), description=None, display_url='pic.twitter.com/XKFQqhgNTN', expanded_url='https://twitter.com/freaky_central2/status/1586143745078353920/video/1', id=1586143673661677569, id_str='1586143673661677569', indices=[188, 211], media_url='http://pbs.twimg.com/ext_tw_video_thumb/1586143673661677569/pu/img/5hJNsSHYfpGRBKXF.jpg', media_url_https='https://pbs.twimg.com/ext_tw_video_thumb/1586143673661677569/pu/img/5hJNsSHYfpGRBKXF.jpg', sizes=Row(large=Row(h=1280, resize='fit', w=592), medium=Row(h=1200, resize='fit', w=555), small=Row(h=680, resize='fit', w=315), thumb=Row(h=150, resize='crop', w=150)), source_status_id=None, source_status_id_str=None, source_user_id=None, source_user_id_str=None, type='video', url='https://t.co/XKFQqhgNTN', video_info=Row(aspect_ratio=[37, 80], duration_millis=27627, variants=[Row(bitrate=2176000, content_type='video/mp4', 
url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/592x1280/gjcYcZKo5e-ZYGGa.mp4?tag=12'), Row(bitrate=None, content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/pl/LVnxWoY7O9mLNE5G.m3u8?tag=12&container=fmp4'), Row(bitrate=632000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/320x690/8hHnDD2IseH3kEhM.mp4?tag=12'), Row(bitrate=950000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1586143673661677569/pu/vid/480x1036/RLYUVMasvBGnSSvI.mp4?tag=12')]))],), First 100 follows gets a surprise \nOpen dms\n\n#bj #pyt #baddie #leaked #oussy #backshots #sellingcontent #schoolthot #dm #head #throatgoat #bbc #sloppy #leaks #college #nsfw #ebony #pyteen https://t.co/XKFQqhgNTN), 155, False, low, None, 1586143745078353920, 1586143745078353920, None, None, None, None, None, False, en, None, False, 0, None, None, None, None, 3, 16, False, None, <a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>, First 100 follows gets a surprise \nOpen dms\n\n#bj #pyt #baddie #leaked #oussy #backshots #sellingcontent #schoolthot… https://t.co/Vdd4JmZ8zn, True, (False, Tue Oct 25 19:35:23 +0000 2022, True, False, everything u need in one place; DM for credit or removal DAILY POST | FOLLOW FOR MORE| SELLING PERSONAL CONTENT DMS OPEN | DM FOR CHEAP PROMO #promo #ads, 82, 2627, 1, False, 1584991955146121217, 1584991955146121217, False, 12, None, FC🔞, F5F8FA, , , False, https://pbs.twimg.com/profile_banners/1584991955146121217/1666726989, http://pbs.twimg.com/profile_images/1584993944756256799/ZOWKxRRp_normal.jpg, https://pbs.twimg.com/profile_images/1584993944756256799/ZOWKxRRp_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, freaky_central2, 87, none, None, False, None, []), None)",first 100 follows gets a surprise \nopen dms\n\n#bj #pyt #baddie #leaked #oussy #backshots #sellingcontent #schoolthot #dm #head #throatgoat 
#bbc #sloppy #leaks #college #nsfw #ebony #pyteen https://t.co/xkfqqhgntn,RT @freaky_central2: First 100 follows gets a surprise \nOpen dms\n\n#bj #pyt #baddie #leaked #oussy #backshots #sellingcontent #schoolthot #d…
4,Sat Oct 29 18:38:43 +0000 2022,1586427325264310273,,maoooo,6,False,,,2,10,"(None, Sat Oct 29 18:23:54 +0000 2022, None, ([Row(indices=[37, 56], text='UniversityofTehran')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1586423598645841922', indices=[117, 140], url='https://t.co/J61gZ0Gy9z')], []), None, ([0, 279], ([Row(indices=[37, 56], text='UniversityofTehran'), Row(indices=[267, 279], text='Mahsa_Amini')], None, [], [], []), None, Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclothes security forces (who had illegally entered the campus). Security forces waiting outside arresting and taking students away to undisclosed location #Mahsa_Amini), 12, False, low, None, 1586423598645841922, 1586423598645841922, None, None, None, None, None, True, en, None, None, 0, (None, Sat Oct 29 18:00:59 +0000 2022, None, ([Row(indices=[11, 23], text='دانشکده_فنی')], None, [], [Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1586417830228267009', indices=[117, 140], url='https://t.co/Fw2dvhchXt')], []), None, ([0, 279], Row(hashtags=[Row(indices=[11, 23], text='دانشکده_فنی'), Row(indices=[268, 279], text='مهسا_امینی')], media=None, symbols=[], urls=[], user_mentions=[]), None, ۱/امروز در #دانشکده_فنی دانشگاه تهران، دانشجویان با تجمع مسالمت‌آمیز، شعار دادند و خواسته‌های خود را مطرح کردند اما مورد حمله نیروهای بسیج دانشجویی قرار گرفتند. 
نیروهای لباس شخصی به طور غیرقانونی از صبح در دانشگاه مستقر شده بودند و با بسیجی‌ها در این حمله همراه شدند.\n#مهسا_امینی), 64, False, low, None, 1586417830228267009, 1586417830228267009, None, None, None, None, None, False, fa, None, None, 1, None, None, 1, 26, False, None, <a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>, ۱/امروز در #دانشکده_فنی دانشگاه تهران، دانشجویان با تجمع مسالمت‌آمیز، شعار دادند و خواسته‌های خود را مطرح کردند اما… https://t.co/Fw2dvhchXt, True, (False, Tue Aug 11 07:17:35 +0000 2015, False, False, Official account of Iran Human Rights (IHR NGO)- Also visit @iranhr حساب رسمی توییتر سازمان حقوق بشر ایران, 1067, 9848, 284, True, 3414505683, 3414505683, False, 119, Oslo, Norway, Iran Human Rights (IHR NGO), 000000, http://abs.twimg.com/images/themes/theme1/bg.png, https://abs.twimg.com/images/themes/theme1/bg.png, False, https://pbs.twimg.com/profile_banners/3414505683/1665403153, http://pbs.twimg.com/profile_images/631002652361424896/NWkvQXd-_normal.png, https://pbs.twimg.com/profile_images/631002652361424896/NWkvQXd-_normal.png, 89C9FA, 000000, 000000, 000000, False, False, IHRights, 8255, none, https://iranhr.net/, True, None, []), None), 1586417830228267009, 1586417830228267009, (twitter.com/ihrights/statu…, https://twitter.com/ihrights/status/1586417830228267009, https://t.co/5MHIgxrjHW), 2, 10, False, None, <a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>, Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclo… https://t.co/J61gZ0Gy9z, True, (False, Sat May 02 13:32:40 +0000 2009, True, False, Director of NGO «Iran Human Rights"" @IHRights Professor at the University of Oslo- مدیر سازمان حقوق بشر ایران- پزشک و استاددانشکده پزشکی دانشگاه اسلو, 9660, 5590, 1642, True, 37196329, 37196329, False, 173, Global, Mahmood Amiry-Moghaddam, C0DEED, http://abs.twimg.com/images/themes/theme1/bg.png, 
https://abs.twimg.com/images/themes/theme1/bg.png, False, https://pbs.twimg.com/profile_banners/37196329/1425896261, http://pbs.twimg.com/profile_images/1306244699716489216/BfQaxpEW_normal.jpg, https://pbs.twimg.com/profile_images/1306244699716489216/BfQaxpEW_normal.jpg, 1DA1F2, C0DEED, DDEEF6, 333333, True, False, iranhr, 11461, none, http://www.iranhr.net, True, None, []), None)","today’s peaceful student protests of #universityoftehran, college of engineering was attacked by basij and plainclothes security forces (who had illegally entered the campus). security forces waiting outside arresting and taking students away to undisclosed location #mahsa_amini","RT @iranhr: Today’s peaceful student protests of #UniversityofTehran, College of Engineering was attacked by basij and plainclothes securit…"


In [34]:
# Null audit: tally the null entries of every column to surface discrepancies.
# F.when(...) with no otherwise() evaluates to NULL for non-null cells, and
# F.count ignores NULLs, so each aggregate counts only the cells that are null.
# The one-row result is collected to the driver as a pandas DataFrame.
df_nuls_cnt = df_coords_analysis_df.select(
    *(
        F.count(F.when(df_coords_analysis_df[name].isNull(), name)).alias(name)
        for name in df_coords_analysis_df.columns
    )
).toPandas()

# display.max_columns : int

                                                                                

In [36]:
# Show the per-column null counts held in df_nuls_cnt (a pandas DataFrame produced by .toPandas()).
df_nuls_cnt

Unnamed: 0,created_at,id,geo_coordinates,user_name,followers_count,verified_user,user_location,user_description,reply_count,retweet_count,retweeted_status,tweet_text,text
0,0,0,997584,0,0,0,451915,231810,352608,352608,352608,0,0


In [41]:
%%time 
df_coords_analysis_df.write.mode("overwrite").parquet("gs://msca-bdp-students-bucket/shared_data/kishorkumarreddy/df_coords_analysis_df/")

                                                                                

CPU times: user 240 ms, sys: 30.4 ms, total: 270 ms
Wall time: 3min 46s
