# Projeto 3 - Análise de Sentimentos
Data Science Academy

In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.11.4


In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType, IntegerType

In [3]:
from langdetect import detect
from textblob import TextBlob

ModuleNotFoundError: No module named 'langdetect'

In [19]:
spark = SparkSession.builder.appName("AnaliseSentimento").getOrCreate()

In [20]:
# Carregue o CSV como um DataFrame
df = spark.read.csv("chatgpt1.csv", header=True, inferSchema=True)

In [21]:
df.show(5)

+----------------------------------------+-------------------+---------------------------+--------+---------+----+--------+----------+----------+------------+---------+----------+--------------+--------+------+-----+-----------+--------------+-------+-------------+
|                                Datetime|           Tweet Id|                       Text|Username|Permalink|User|Outlinks|CountLinks|ReplyCount|RetweetCount|LikeCount|QuoteCount|ConversationId|Language|Source|Media|QuotedTweet|MentionedUsers|hashtag|hastag_counts|
+----------------------------------------+-------------------+---------------------------+--------+---------+----+--------+----------+----------+------------+---------+----------+--------------+--------+------+-----+-----------+--------------+-------+-------------+
|                    2023-01-22 13:44:...|1617156270871699456|ChatGPTで遊ぶの忘れてた！！|    NULL|     NULL|NULL|    NULL|      NULL|      NULL|        NULL|     NULL|      NULL|          NULL|    NULL|  NULL| N

In [22]:
df.count()

82129

In [23]:
df.columns

['Datetime',
 'Tweet Id',
 'Text',
 'Username',
 'Permalink',
 'User',
 'Outlinks',
 'CountLinks',
 'ReplyCount',
 'RetweetCount',
 'LikeCount',
 'QuoteCount',
 'ConversationId',
 'Language',
 'Source',
 'Media',
 'QuotedTweet',
 'MentionedUsers',
 'hashtag',
 'hastag_counts']

In [24]:
df = df.select("Text")

In [25]:
df.show(5)

+---------------------------+
|                       Text|
+---------------------------+
|ChatGPTで遊ぶの忘れてた！！|
|                       NULL|
|                       NULL|
|                       NULL|
|                       NULL|
+---------------------------+
only showing top 5 rows



In [26]:
df = df.dropna()

In [27]:
df.show(5)

+---------------------------+
|                       Text|
+---------------------------+
|ChatGPTで遊ぶの忘れてた！！|
|       https://twitter.c...|
|       @AlexandrovnaIng ...|
|       Schaut Euch an, w...|
|       https://twitter.c...|
+---------------------------+
only showing top 5 rows



In [28]:
def is_english(text):
    try:
        lang = detect(text)
        print(lang)
        return lang=='en'
    except:
        return False

udf_is_english = udf(is_english, BooleanType())

In [29]:
df = df.withColumn("is_english", udf_is_english(df.Text))

In [30]:
df.show()

+----------------------------+----------+
|                        Text|is_english|
+----------------------------+----------+
| ChatGPTで遊ぶの忘れてた！！|     false|
|        https://twitter.c...|     false|
|        @AlexandrovnaIng ...|      true|
|        Schaut Euch an, w...|     false|
|        https://twitter.c...|     false|
|        Bow down to chatG...|      true|
|        Profilinde vatan,...|     false|
|        ChatGPT’nin bilin...|     false|
|        https://twitter.c...|     false|
|        ChatGPT runs 10K ...|      true|
|        @SWENGDAD There i...|      true|
|        I created a ficti...|      true|
|        https://twitter.c...|     false|
|        It will look simi...|      true|
|        If you ever playe...|      true|
|        https://twitter.c...|     false|
|        AI will initiate ...|      true|
|        https://twitter.c...|     false|
|最新コメント15件（01/22 2...|     false|
|        https://twitter.c...|     false|
+----------------------------+----------+
only showi

In [34]:
df = df.where(df.is_english==True)

In [35]:
from textblob import TextBlob

def analisar_sentimento(tweet):
    blob = TextBlob(tweet)
    polaridade = blob.sentiment.polarity
    if polaridade > 0:
        return 1
    elif polaridade < 0:
        return -1
    return 0



In [37]:
udf_analisar_sentimento = udf(analisar_sentimento, IntegerType())

In [38]:
df = df.withColumn('label', udf_analisar_sentimento(df.Text))

In [39]:
df.show()

+--------------------+----------+-----+
|                Text|is_english|label|
+--------------------+----------+-----+
|@AlexandrovnaIng ...|      true|    0|
|Bow down to chatG...|      true|   -1|
|ChatGPT runs 10K ...|      true|    1|
|@SWENGDAD There i...|      true|   -1|
|I created a ficti...|      true|    0|
|It will look simi...|      true|    1|
|If you ever playe...|      true|   -1|
|AI will initiate ...|      true|    1|
|🤯 #ChatGPT write...|      true|    0|
|Pay $42 per month...|      true|    0|
|Your own AI knowl...|      true|   -1|
|ChatGPT | write m...|      true|    0|
|@Slipcatch @nickh...|      true|    0|
|@paulgp Leverage ...|      true|    1|
|@FrankDeya The an...|      true|   -1|
|          chatgpt ☺️|      true|    0|
|#Tech: Google is ...|      true|    0|
|There goes the en...|      true|    0|
|Problems of Pakis...|      true|    0|
|"The question is ...|      true|   -1|
+--------------------+----------+-----+
only showing top 20 rows



In [41]:
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|   -1| 4593|
|    1|12806|
|    0|15986|
+-----+-----+



In [42]:
df.createOrReplaceTempView('tt')

In [61]:
r = spark.sql('SELECT Text FROM tt WHERE label = -1').collect()

In [70]:
r[8][0]

'@ShiLLin_ViLLian ChatGPT is crazy been messing around with it a little 👀'