In [2]:
import json
import pandas as pd
from textblob import TextBlob


def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    
    
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [d['user']['statuses_count'] for d in self.tweets_list]
        return statuses_count 
        
    def find_full_text(self)->list:
        text = [d['full_text'] for d in self.tweets_list]
        return text
    
    def find_sentiments(self, text)->list:
        polarity=[TextBlob(line).sentiment[0] for line in text]
        subjectivity=[TextBlob(line).sentiment[1] for line in text]
        return polarity, subjectivity

    def find_created_time(self)->list:
        created_at = [d['created_at'] for d in self.tweets_list]
        return created_at

    def find_source(self)->list:
        source = [d['source'] for d in self.tweets_list]

        return source

    def find_screen_name(self)->list:
        screen_name = [d['user']['screen_name'] for d in self.tweets_list]
        return screen_name
    def find_followers_count(self)->list:
        followers_count = [d['user']['followers_count'] for d in self.tweets_list]
        return followers_count
    def find_friends_count(self)->list:
        friends_count = [d['user']['friends_count'] for d in self.tweets_list]
        return friends_count
    def is_sensitive(self)->list:
        arr=[]
        for d in self.tweets_list:
            if 'retweeted_status' in d.keys():
                if 'possibly_sensitive' in d['retweeted_status'].keys():
                    arr.append(d['retweeted_status']['possibly_sensitive'])
                else :
                    arr.append('')
            else :
                arr.append('')
        return arr

    def find_favourite_count(self)->list:
        favourite_count = [d['user']['favourites_count'] for d in self.tweets_list]
        return favourite_count
    def find_retweet_count(self)->list:
        retweet_count = [d.get('retweet_count') for d in self.tweets_list ]
        return retweet_count
    def find_hashtags(self)->list:
        arr=[]
        str=''
        for d in self.tweets_list:
            if len(d['entities']['hashtags']) > 0 :
                for i in range(len(d['entities']['hashtags'])):
                    str=str+', '+d['entities']['hashtags'][i]['text']
                arr.append(str)
            else :
                arr.append('')
        return arr

    def find_mentions(self)->list:
        mentions = [d['entities']['user_mentions'] for d in self.tweets_list]
        return mentions
    def find_location(self)->list:
        try:
            location = [d['user']['location'] for d in self.tweets_list]
        except TypeError:
            location = ''
        
        return location
    def find_lang(self)->list:
        lang = [d['lang'] for d in self.tweets_list]
        return lang
    
        
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("me.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(True) 

    # use all defined functions to generate a dataframe with the specified columns above

File Successfully Saved.!!!


In [3]:
tweet_df

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,False,", City","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,False,", City, China, Taiwan","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,", City, China, Taiwan, XiJinping","[{'screen_name': 'ZelenskyyUa', 'name': 'Волод...",Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,", City, China, Taiwan, XiJinping, XiJinping",[],Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,False,,"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz"


In [9]:
import json
import pandas as pd
from textblob import TextBlob


def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    
    
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [d['user']['statuses_count'] for d in self.tweets_list]
        return statuses_count 
        
    def find_full_text(self)->list:
        text = [d['full_text'] for d in self.tweets_list]
        return text
    
    def find_sentiments(self, text)->list:
        polarity=[TextBlob(line).sentiment[0] for line in text]
        subjectivity=[TextBlob(line).sentiment[1] for line in text]
        return polarity, subjectivity

    def find_created_time(self)->list:
        created_at = [d['created_at'] for d in self.tweets_list]
        return created_at

    def find_source(self)->list:
        source = [d['source'] for d in self.tweets_list]

        return source

    def find_screen_name(self)->list:
        screen_name = [d['user']['screen_name'] for d in self.tweets_list]
        return screen_name
    def find_followers_count(self)->list:
        followers_count = [d['user']['followers_count'] for d in self.tweets_list]
        return followers_count
    def find_friends_count(self)->list:
        friends_count = [d['user']['friends_count'] for d in self.tweets_list]
        return friends_count
    def is_sensitive(self)->list:
        arr=[]
        for d in self.tweets_list:
            if 'retweeted_status' in d.keys():
                if 'possibly_sensitive' in d['retweeted_status'].keys():
                    arr.append(d['retweeted_status']['possibly_sensitive'])
                else :
                    arr.append('')
            else :
                arr.append('')
        return arr

    def find_favourite_count(self)->list:
        favourite_count = [d['user']['favourites_count'] for d in self.tweets_list]
        return favourite_count
    def find_retweet_count(self)->list:
        retweet_count = [d.get('retweet_count') for d in self.tweets_list ]
        return retweet_count
    def find_hashtags(self)->list:
        hashtags = [d['entities']['hashtags'] for d in self.tweets_list]
        return hashtags

    def find_mentions(self)->list:
        mentions = [d['entities']['user_mentions'] for d in self.tweets_list]
        return mentions
    def find_location(self)->list:
        try:
            location = [d['user']['location'] for d in self.tweets_list]
        except TypeError:
            location = ''
        
        return location
    def find_lang(self)->list:
        lang = [d['lang'] for d in self.tweets_list]
        return lang
    
        
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("me.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(True) 

    # use all defined functions to generate a dataframe with the specified columns above

File Successfully Saved.!!!


In [10]:
tweet_df

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,False,"[{'text': 'City', 'indices': [132, 137]}]","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,False,"[{'text': 'China', 'indices': [18, 24]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,"[{'text': 'XiJinping', 'indices': [127, 137]}]","[{'screen_name': 'ZelenskyyUa', 'name': 'Волод...",Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,"[{'text': 'XiJinping', 'indices': [9, 19]}]",[],Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,False,[],"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz"


In [6]:
tweet_df.drop_duplicates()

TypeError: unhashable type: 'list'

In [77]:
import json
import pandas as pd
from textblob import TextBlob


def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    
    
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [''+d['user']['statuses_count'] for d in self.tweets_list]
        return statuses_count 
        
    def find_full_text(self)->list:
        text = [d['full_text'] for d in self.tweets_list]
        return text
    
    def find_sentiments(self, text)->list:
        polarity=[TextBlob(line).sentiment[0] for line in text]
        subjectivity=[TextBlob(line).sentiment[1] for line in text]
        return polarity, subjectivity

    def find_created_time(self)->list:
        created_at = [d['created_at'] for d in self.tweets_list]
        return created_at

    def find_source(self)->list:
        source = [d['source'] for d in self.tweets_list]

        return source

    def find_screen_name(self)->list:
        screen_name = [d['user']['screen_name'] for d in self.tweets_list]
        return screen_name
    def find_followers_count(self)->list:
        followers_count = [d['user']['followers_count'] for d in self.tweets_list]
        return followers_count
    def find_friends_count(self)->list:
        friends_count = [d['user']['friends_count'] for d in self.tweets_list]
        return friends_count
    def is_sensitive(self)->list:
        arr=[]
        for d in self.tweets_list:
            if 'retweeted_status' in d.keys():
                if 'possibly_sensitive' in d['retweeted_status'].keys():
                    arr.append(d['retweeted_status']['possibly_sensitive'])
                else :
                    arr.append('')
            else :
                arr.append('')
        return arr

    def find_favourite_count(self)->list:
        favourite_count = [d['user']['favourites_count'] for d in self.tweets_list]
        return favourite_count
    def find_retweet_count(self)->list:
        retweet_count = [d.get('retweet_count') for d in self.tweets_list ]
        return retweet_count
    def find_hashtags(self)->list:
        hashtags = [str(d['entities']['hashtags'])[1:-1] for d in self.tweets_list]
        return hashtags

    def find_mentions(self)->list:
        mentions = [str(d['entities']['user_mentions'])[1:-1] for d in self.tweets_list]
        return mentions
    def find_location(self)->list:
        try:
            location = [d['user']['location'] for d in self.tweets_list]
        except TypeError:
            location = ''
        
        return location
    def find_lang(self)->list:
        lang = [d['lang'] for d in self.tweets_list]
        return lang
    
        
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("me.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(True) 

    # use all defined functions to generate a dataframe with the specified columns above

File Successfully Saved.!!!


In [78]:
tweet_df

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,False,"{'text': 'City', 'indices': [132, 137]}","{'screen_name': 'i_ameztoy', 'name': 'Iban Ame...",
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,False,"{'text': 'China', 'indices': [18, 24]}, {'text...","{'screen_name': 'IndoPac_Info', 'name': 'Indo-...",
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [127, 137]}","{'screen_name': 'ZelenskyyUa', 'name': 'Володи...",Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [9, 19]}",,Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,False,,"{'screen_name': 'ChinaUncensored', 'name': 'Ch...","Ayent, Schweiz"


In [79]:
tweet_df.drop_duplicates()


Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,False,"{'text': 'City', 'indices': [132, 137]}","{'screen_name': 'i_ameztoy', 'name': 'Iban Ame...",
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,False,"{'text': 'China', 'indices': [18, 24]}, {'text...","{'screen_name': 'IndoPac_Info', 'name': 'Indo-...",
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [127, 137]}","{'screen_name': 'ZelenskyyUa', 'name': 'Володи...",Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [9, 19]}",,Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,False,,"{'screen_name': 'ChinaUncensored', 'name': 'Ch...","Ayent, Schweiz"


In [80]:
tweet_df['created_at']

0    Sun Aug 07 22:31:20 +0000 2022
1    Sun Aug 07 22:31:16 +0000 2022
2    Sun Aug 07 22:31:07 +0000 2022
3    Sun Aug 07 22:31:06 +0000 2022
4    Sun Aug 07 22:31:04 +0000 2022
Name: created_at, dtype: object

In [81]:
mystr='Sun Aug 07 22:31:20 +0025 2022'
from datetime import datetime
datetime_object =datetime.strptime(mystr, '%a %b %d %H:%M:%S +%f %Y')
print(datetime_object)  # printed in default format


2022-08-07 22:31:20.002500


In [82]:
for i in range(len(tweet_df['created_at'])):
            from datetime import datetime
            datetime_object =datetime.strptime(tweet_df['created_at'][i], '%a %b %d %H:%M:%S +%f %Y')
            tweet_df['created_at'][i]=str(datetime_object) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweet_df['created_at'][i]=str(datetime_object)


In [83]:
tweet_df['created_at']

0    2022-08-07 22:31:20
1    2022-08-07 22:31:16
2    2022-08-07 22:31:07
3    2022-08-07 22:31:06
4    2022-08-07 22:31:04
Name: created_at, dtype: object

In [84]:
tweet_df['polarity'] = pd.to_numeric(tweet_df['polarity'])

In [85]:
tweet_df['lang'][3]='fr'
tweet_df['lang']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweet_df['lang'][3]='fr'


0    en
1    en
2    en
3    fr
4    en
Name: lang, dtype: object

In [88]:
tweet_df.drop(tweet_df[tweet_df.lang != 'en'].index,inplace=True)

In [89]:
tweet_df.lang

0    en
1    en
2    en
4    en
Name: lang, dtype: object

In [98]:
import json
import pandas as pd
from textblob import TextBlob
from clean_tweets_dataframe import Clean_Tweets

def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    
    
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [''+d['user']['statuses_count'] for d in self.tweets_list]
        return statuses_count 
        
    def find_full_text(self)->list:
        text = [d['full_text'] for d in self.tweets_list]
        return text
    
    def find_sentiments(self, text)->list:
        polarity=[TextBlob(line).sentiment[0] for line in text]
        subjectivity=[TextBlob(line).sentiment[1] for line in text]
        return polarity, subjectivity

    def find_created_time(self)->list:
        created_at = [d['created_at'] for d in self.tweets_list]
        return created_at

    def find_source(self)->list:
        source = [d['source'] for d in self.tweets_list]

        return source

    def find_screen_name(self)->list:
        screen_name = [d['user']['screen_name'] for d in self.tweets_list]
        return screen_name
    def find_followers_count(self)->list:
        followers_count = [d['user']['followers_count'] for d in self.tweets_list]
        return followers_count
    def find_friends_count(self)->list:
        friends_count = [d['user']['friends_count'] for d in self.tweets_list]
        return friends_count
    def is_sensitive(self)->list:
        arr=[]
        for d in self.tweets_list:
            if 'retweeted_status' in d.keys():
                if 'possibly_sensitive' in d['retweeted_status'].keys():
                    arr.append(d['retweeted_status']['possibly_sensitive'])
                else :
                    arr.append('')
            else :
                arr.append('')
        return arr

    def find_favourite_count(self)->list:
        favourite_count = [d['user']['favourites_count'] for d in self.tweets_list]
        return favourite_count
    def find_retweet_count(self)->list:
        retweet_count = [d.get('retweet_count') for d in self.tweets_list ]
        return retweet_count
    def find_hashtags(self)->list:
        hashtags = [str(d['entities']['hashtags'])[1:-1] for d in self.tweets_list]
        return hashtags

    def find_mentions(self)->list:
        mentions = [str(d['entities']['user_mentions'])[1:-1] for d in self.tweets_list]
        return mentions
    def find_location(self)->list:
        try:
            location = [d['user']['location'] for d in self.tweets_list]
        except TypeError:
            location = ''
        
        return location
    def find_lang(self)->list:
        lang = [d['lang'] for d in self.tweets_list]
        return lang
    
        
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("me.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(True) 
    clean_tweet = Clean_Tweets(tweet_df)
    clean_tweet_df = clean_tweet.get_clean_tweet_df()
    # use all defined functions to generate a dataframe with the specified columns above

File Successfully Saved.!!!
Automation in Action...!!!


AttributeError: 'Clean_Tweets' object has no attribute 'get_clean_tweet_df'

In [99]:
 clean_tweet_df.df

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,False,"{'text': 'City', 'indices': [132, 137]}","{'screen_name': 'i_ameztoy', 'name': 'Iban Ame...",
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,False,"{'text': 'China', 'indices': [18, 24]}, {'text...","{'screen_name': 'IndoPac_Info', 'name': 'Indo-...",
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [127, 137]}","{'screen_name': 'ZelenskyyUa', 'name': 'Володи...",Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,"{'text': 'XiJinping', 'indices': [9, 19]}",,Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,False,,"{'screen_name': 'ChinaUncensored', 'name': 'Ch...","Ayent, Schweiz"


In [116]:
import json
import pandas as pd
from textblob import TextBlob
class Clean_Tweets:
    """
    The PEP8 Standard AMAZING!!!
    """
    def __init__(self, df:pd.DataFrame):
        self.df = df
        print('Automation in Action...!!!')
        
    def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame:
        """
        remove rows that has column names. This error originated from
        the data collection stage.  
        """
        unwanted_rows = df[df['retweet_count'] == 'retweet_count' ].index
        df.drop(unwanted_rows , inplace=True)
        df = df[df['polarity'] != 'polarity']
        return df
    def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame:
        """
        drop duplicate rows
        """
        return df.drop_duplicates()
    def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame:
        """
        convert column to datetime
        """ 
        df = df[df['created_at'] >= '2020-12-31' ]
        for i in range(len(df['created_at'])):
            from datetime import datetime
            datetime_object =datetime.strptime(df['created_at'][i], '%a %b %d %H:%M:%S +%f %Y')
            df['created_at'][i]=str(datetime_object) 
        return df
    
    def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame:
        """
        convert columns like polarity, subjectivity, retweet_count
        favorite_count etc to numbers
        """
        df['polarity'] = pd.to_numeric(df['polarity'])
        df['subjectivity'] = pd.to_numeric(df['subjectivity'])
        df['retweet_count'] = pd.to_numeric(df['retweet_count'])
        df['favorite_count'] = pd.to_numeric(df['favorite_count'])
        return df
    
    def remove_non_english_tweets(self, df:pd.DataFrame)->pd.DataFrame:
        """
        remove non english tweets from lang
        """
        
        df = df.drop(df[df.lang != 'en'].index,inplace=True)
        return df
    def get_clean_tweet_df(self, save=False)->pd.DataFrame:
        clean_tweet_df = self.drop_unwanted_column(self.df)
        clean_tweet_df = self.drop_duplicate(clean_tweet_df)
        clean_tweet_df = self.convert_to_datetime(clean_tweet_df)
        clean_tweet_df = self.convert_to_numbers(clean_tweet_df)
        print(clean_tweet_df)
        #clean_tweet_df = self.remove_non_english_tweets(clean_tweet_df)
        if save:
            clean_tweet_df.to_csv('processed_clean_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        return clean_tweet_df





In [117]:
import json
import pandas as pd
from textblob import TextBlob


def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    
    
    return len(tweets_data), tweets_data

class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        
        self.tweets_list = tweets_list

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [''+d['user']['statuses_count'] for d in self.tweets_list]
        return statuses_count 
        
    def find_full_text(self)->list:
        text = [d['full_text'] for d in self.tweets_list]
        return text
    
    def find_sentiments(self, text)->list:
        polarity=[TextBlob(line).sentiment[0] for line in text]
        subjectivity=[TextBlob(line).sentiment[1] for line in text]
        return polarity, subjectivity

    def find_created_time(self)->list:
        created_at = [d['created_at'] for d in self.tweets_list]
        return created_at

    def find_source(self)->list:
        source = [d['source'] for d in self.tweets_list]

        return source

    def find_screen_name(self)->list:
        screen_name = [d['user']['screen_name'] for d in self.tweets_list]
        return screen_name
    def find_followers_count(self)->list:
        followers_count = [d['user']['followers_count'] for d in self.tweets_list]
        return followers_count
    def find_friends_count(self)->list:
        friends_count = [d['user']['friends_count'] for d in self.tweets_list]
        return friends_count
    def is_sensitive(self)->list:
        arr=[]
        for d in self.tweets_list:
            if 'retweeted_status' in d.keys():
                if 'possibly_sensitive' in d['retweeted_status'].keys():
                    arr.append(d['retweeted_status']['possibly_sensitive'])
                else :
                    arr.append('')
            else :
                arr.append('')
        return arr

    def find_favourite_count(self)->list:
        favourite_count = [d['user']['favourites_count'] for d in self.tweets_list]
        return favourite_count
    def find_retweet_count(self)->list:
        retweet_count = [d.get('retweet_count') for d in self.tweets_list ]
        return retweet_count
    def find_hashtags(self)->list:
        hashtags = [str(d['entities']['hashtags'])[1:-1] for d in self.tweets_list]
        return hashtags

    def find_mentions(self)->list:
        mentions = [str(d['entities']['user_mentions'])[1:-1] for d in self.tweets_list]
        return mentions
    def find_location(self)->list:
        try:
            location = [d['user']['location'] for d in self.tweets_list]
        except TypeError:
            location = ''
        
        return location
    def find_lang(self)->list:
        lang = [d['lang'] for d in self.tweets_list]
        return lang
    
        
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = zip(created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df

                
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    _, tweet_list = read_json("me.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df(True) 
    clean_tweet = Clean_Tweets(tweet_df)
    clean_tweet_df = clean_tweet.get_clean_tweet_df()
    # use all defined functions to generate a dataframe with the specified columns above

File Successfully Saved.!!!
Automation in Action...!!!
            created_at                                             source  \
0  2022-08-07 22:31:20  <a href="http://twitter.com/download/android" ...   
1  2022-08-07 22:31:16  <a href="http://twitter.com/download/android" ...   
2  2022-08-07 22:31:07  <a href="http://twitter.com/download/android" ...   
3  2022-08-07 22:31:06  <a href="http://twitter.com/download/android" ...   
4  2022-08-07 22:31:04  <a href="http://twitter.com/download/iphone" r...   

                                       original_text      polarity  \
0  RT @i_ameztoy: Extra random image (I):\n\nLets... -1.250000e-01   
1  RT @IndoPac_Info: #China's media explains the ... -1.000000e-01   
2  China even cut off communication, they don't a...  0.000000e+00   
3  Putin to #XiJinping : I told you my friend, Ta...  1.000000e-01   
4  RT @ChinaUncensored: I’m sorry, I thought Taiw... -6.938894e-18   

   subjectivity lang  favorite_count  retweet_count original_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['created_at'][i]=str(datetime_object)


In [118]:
clean_tweet_df
