<a href="https://colab.research.google.com/github/pbeles/Project-3-Sentiment-Analysis-/blob/main/Scraping_reddit_google_appstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages in Colab
!pip install tweepy praw textblob google-play-scraper app_store_scraper

# Importing the libraries
import tweepy
import praw
import prawcore  # Explicitly import prawcore for exception handling
import pandas as pd
import numpy as np
from datetime import datetime
from textblob import TextBlob
from google_play_scraper import Sort, reviews_all
from app_store_scraper import AppStore
import time  # For handling rate limits

from google.colab import userdata

# Touch each Colab secret once up front. The returned values are discarded
# here; setup_twitter_api() and setup_reddit_api() call userdata.get() again
# with the same keys when they build their clients.
# NOTE(review): presumably this is meant to surface a missing or
# access-denied secret early -- confirm that userdata.get raises in that case.
userdata.get('twitter')
userdata.get('reddit_app')
userdata.get('app_secret')

# Set up Twitter API
def setup_twitter_api():
    """Build a tweepy client from the 'twitter' Colab secret and verify it.

    A lookup of the 'TwitterDev' user is used as a smoke test for the
    bearer token.

    Returns:
        The tweepy client when the lookup returns data; None when the lookup
        returns no data or when any exception is raised along the way.
    """
    try:
        bearer = userdata.get('twitter')
        api = tweepy.Client(bearer_token=bearer)

        # One cheap request is enough to tell whether the token is accepted.
        probe = api.get_user(username='TwitterDev')
        if not probe.data:
            print("Twitter API credentials are invalid. Please check your bearer token.")
            return None

        print("Twitter API credentials are working.")
        return api
    except tweepy.errors.Unauthorized:
        print("Twitter API Authorization Error: 401 Unauthorized")
        print("Check if the Bearer Token is correct and has sufficient permissions.")
        return None
    except Exception as err:
        print(f"Error setting up Twitter API: {err}")
        return None

# Set up Reddit API
def setup_reddit_api(user_agent='python:tax_review_scrapper:v1.0'):
    """Build a praw Reddit client from Colab secrets and verify it.

    The client id and secret come from the 'reddit_app' and 'app_secret'
    secrets; no username or password is supplied. Fetching a single hot post
    from r/python serves as the credential check.

    Args:
        user_agent: User-agent string handed to praw.

    Returns:
        The praw.Reddit instance when the check yields a post; None when it
        yields nothing or when any exception is raised.
    """
    try:
        credentials = {
            'client_id': userdata.get('reddit_app'),
            'client_secret': userdata.get('app_secret'),
            'user_agent': user_agent,
            'username': None,
            'password': None,
        }
        reddit = praw.Reddit(**credentials)

        # Listing generators are lazy, so materialise one post to force a request.
        sample = list(reddit.subreddit('python').hot(limit=1))
        if not sample:
            print("Reddit API credentials are invalid.")
            return None

        print("Reddit API credentials are working.")
        return reddit
    except prawcore.exceptions.OAuthException:
        print("Reddit API Authorization Error: 401 Unauthorized")
        print("Check if the Client ID, Client Secret, and User Agent are correct.")
        return None
    except Exception as err:
        print(f"Error setting up Reddit API: {err}")
        return None

# Analyze sentiment of text
def analyze_sentiment(text):
    """Classify *text* by its TextBlob sentiment polarity.

    Returns:
        "positive" when the polarity is above 0.1, "negative" when it is
        below -0.1, and "neutral" for everything in between.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0.1:
        return "positive"
    if polarity < -0.1:
        return "negative"
    return "neutral"

# Search for Twitter reviews
def search_twitter_reviews(client, tax_software_names, max_results=100,
                           country='Germany', max_per_software=450,
                           pause_seconds=1):
    """Collect recent tweets that look like reviews of each tax product.

    For every name the recent-search endpoint is queried with
    ``"<name> (review OR reviews) -is:retweet"`` and the result pages are
    followed via ``next_token`` until the per-product cap is reached, a page
    comes back empty, or there is no further page.

    Args:
        client: Object exposing ``search_recent_tweets(**kwargs)`` (a
            tweepy client). A falsy value skips Twitter entirely.
        tax_software_names: Iterable of product names to search for.
        max_results: Requested page size; clamped to the 10-100 range the
            recent-search endpoint accepts.
        country: Value stored in each record's 'country' field.
        max_per_software: Maximum number of tweets kept per product.
        pause_seconds: Delay between consecutive page requests.

    Returns:
        A list of dicts with the keys 'platform', 'software', 'text',
        'date', 'likes', 'retweets' and 'country'.
    """
    reviews = []

    if not client:
        print("Twitter client not initialized, skipping Twitter reviews.")
        return reviews

    for software in tax_software_names:
        # Search for tweets mentioning the tax software
        query = f"{software} (review OR reviews) -is:retweet"
        count = 0
        next_token = None

        while count < max_per_software:
            remaining = max_per_software - count
            # The endpoint rejects page sizes outside 10..100, so never ask
            # for fewer than 10 even when fewer are still needed; the surplus
            # is trimmed below.
            page_size = max(10, min(max_results, 100, remaining))
            request = {
                'query': query,
                'max_results': page_size,
                'tweet_fields': ['created_at', 'public_metrics'],
            }
            if next_token:
                request['next_token'] = next_token

            try:
                tweets = client.search_recent_tweets(**request)
                page = tweets.data or []
                for tweet in page[:remaining]:
                    reviews.append({
                        'platform': 'Twitter',
                        'software': software,
                        'text': tweet.text,
                        'date': tweet.created_at,
                        'likes': tweet.public_metrics['like_count'],
                        'retweets': tweet.public_metrics['retweet_count'],
                        'country': country
                    })
                    count += 1
            except Exception as e:
                print(f"Error retrieving tweets for {software}: {e}")
                break

            # Without a continuation token the same first page would be
            # fetched again; with an empty page the counter would never
            # advance. Either way there is nothing more to collect.
            meta = getattr(tweets, 'meta', None) or {}
            next_token = meta.get('next_token')
            if not page or not next_token:
                break

            # Avoid hitting rate limits
            time.sleep(pause_seconds)

    return reviews

# Search for Reddit reviews
def search_reddit_reviews(reddit, tax_software_names, country, time_filter='year'):
    """Collect Reddit posts that match "<name> review" for each tax product.

    Each product is searched in the 'tax', 'personalfinance' and 'software'
    subreddits. A failure in one subreddit is reported and skipped so that
    the remaining searches still run.

    Args:
        reddit: Object exposing ``subreddit(name).search(query, time_filter=...)``
            (a praw client). A falsy value skips Reddit entirely.
        tax_software_names: Iterable of product names to search for.
        country: Value stored in each record's 'country' field.
        time_filter: Passed through to the subreddit search.

    Returns:
        A list of dicts with the keys 'platform', 'software', 'title',
        'text', 'date' (timezone-aware UTC datetime), 'score',
        'num_comments', 'subreddit' and 'country'.
    """
    # Imported locally because the file-level import only brings in `datetime`.
    from datetime import timezone

    reviews = []
    relevant_subreddits = ['tax', 'personalfinance', 'software']

    if not reddit:
        print("Reddit client not initialized, skipping Reddit reviews.")
        return reviews

    for software in tax_software_names:
        for subreddit_name in relevant_subreddits:
            try:
                subreddit = reddit.subreddit(subreddit_name)

                # Search for posts containing the software name
                for post in subreddit.search(f"{software} review", time_filter=time_filter):
                    reviews.append({
                        'platform': 'Reddit',
                        'software': software,
                        'title': post.title,
                        'text': post.selftext,
                        # created_utc is a UTC epoch; converting without tz
                        # would silently shift it into the host's local time.
                        'date': datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                        'score': post.score,
                        'num_comments': post.num_comments,
                        'subreddit': subreddit_name,
                        'country': country
                    })
            except Exception as e:
                print(f"Error retrieving Reddit posts for {software} in r/{subreddit_name}: {e}")
                continue

    return reviews

# Search for App Store reviews
def search_app_store_reviews(app_name, country='us', how_many=200):
    """Fetch App Store reviews for one app and return them as a DataFrame.

    Args:
        app_name: App name handed to the scraper; also stored in 'software'.
        country: Storefront code handed to the scraper; also stored in 'country'.
        how_many: Number of reviews requested from the scraper.

    Returns:
        A DataFrame with one row per scraped review, one column per review
        field, plus 'software', 'country' and 'platform' ('App Store').
    """
    scraper = AppStore(country=country, app_name=app_name)
    scraper.review(how_many=how_many)

    # Wrap every review in a single column first, then fan its fields out
    # into individual columns.
    wrapped = pd.DataFrame(np.array(scraper.reviews), columns=['review'])
    expanded = pd.DataFrame(wrapped.pop('review').tolist())
    frame = wrapped.join(expanded)

    frame['software'] = app_name
    frame['country'] = country
    frame['platform'] = 'App Store'
    return frame

# Main function to run the script
def main():
    """Run the whole scrape and write the combined result to CSV.

    Twitter and Reddit are searched for the German app list only; the App
    Store is scraped for the UK ('gb') and Spanish ('es') lists. Everything
    is concatenated into one DataFrame and, unless that frame is empty,
    saved as 'tax_software_reviews.csv'.
    """
    # List of tax software applications by country
    germany_tax_apps = [
        "WISO Steuer", "Taxfix", "Steuerbot", "ELSTER", "Tax Fix Pro", "Steuergo",
        "Taxman", "Steuererklärung", "SteuerSparErklärung", "Steuer Web"
    ]
    uk_tax_apps = [
        "SimpleTax", "GoSimpleTax", "HMRC Official App", "FreeAgent", "QuickBooks UK",
        "TaxCalc", "Sage UK", "Tax Calc Pro", "Which? Tax Calculator", "Self Assessment Tax Return"
    ]
    spain_tax_apps = [
        "Declaración de la Renta", "TaxDown", "Contasimple", "Hacienda App", "Mi Agencia Tributaria",
        "Declaracion Renta Web", "Gestoria Online", "IVA Digital", "Declaracion Renta Online", "Mi AEAT"
    ]

    # Initialize API clients
    twitter_client = setup_twitter_api()
    reddit_client = setup_reddit_api()

    # Social platforms: German products only.
    frames = [
        pd.DataFrame(search_twitter_reviews(twitter_client, germany_tax_apps)),
        pd.DataFrame(search_reddit_reviews(reddit_client, germany_tax_apps, 'Germany')),
    ]

    # App Store: UK storefront first, then Spain, one frame per app.
    for store_code, app_names in (('gb', uk_tax_apps), ('es', spain_tax_apps)):
        for app_name in app_names:
            print(f"Collecting App Store reviews for {app_name}...")
            frames.append(
                search_app_store_reviews(app_name, country=store_code, how_many=200)
            )

    # Combine reviews into a single DataFrame
    all_reviews = pd.concat(frames, ignore_index=True)

    if all_reviews.empty:
        print("No reviews collected.")
        return

    # Save the reviews to a CSV file
    all_reviews.to_csv('tax_software_reviews.csv', index=False)
    print("Reviews saved to tax_software_reviews.csv")

# Run the full scrape immediately when this code is executed.
main()


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting app_store_scraper
  Downloading app_store_scraper-0.3.5-py3-none-any.whl.metadata (5.1 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
INFO: pip is looking at multiple versions of app-store-scraper to determine which version is compatible with other requirements. This could take a while.
Collecting app_store_scraper
  Downloading app_store_scraper-0.3.4-py3-none-any.whl.metadata (5.0 kB)
  Downloading app_store_scraper-0.3.3-py3-none-any.whl.metadata (5.0 kB)
  Downloading app_store_scraper-0.3.0-py3-n

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Reddit API credentials are working.
Twitter client not initialized, skipping Twitter reviews.


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Collecting App Store reviews for SimpleTax...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/1659997701/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for GoSimpleTax...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/1151450846/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for HMRC Official App...
Collecting App Store reviews for FreeAgent...
Collecting App Store reviews for QuickBooks UK...
Collecting App Store reviews for TaxCalc...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/6450438064/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Sage UK...
Collecting App Store reviews for Tax Calc Pro...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/574699152/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Which? Tax Calculator...
Collecting App Store reviews for Self Assessment Tax Return...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/940247939/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Declaración de la Renta...
Collecting App Store reviews for TaxDown...
Collecting App Store reviews for Contasimple...
Collecting App Store reviews for Hacienda App...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/es/apps/978020031/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Mi Agencia Tributaria...
Collecting App Store reviews for Declaracion Renta Web...
Collecting App Store reviews for Gestoria Online...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/es/apps/6657984254/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for IVA Digital...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/es/apps/1255843853/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Declaracion Renta Online...


ERROR:Base:Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/es/apps/1204339670/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 404 error responses'))


Collecting App Store reviews for Mi AEAT...
Reviews saved to tax_software_reviews.csv
