In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/unamed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Création de la session Spark
spark = SparkSession.builder.appName("Twitter Sentiment Analysis").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/07 10:56:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/07 10:56:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60554)
Traceback (most recent call last):
  File "/usr/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
 

In [4]:
# Définir le schéma pour le fichier CSV
schema = StructType([
    StructField("Tweet ID", IntegerType(), True),
    StructField("Entity", StringType(), True),
    StructField("Sentiment", StringType(), True),
    StructField("Tweet content", StringType(), True)
])

In [5]:
# Chargement des données depuis le fichier CSV avec le schéma spécifié
df = spark.read.option("header", "true").schema(schema).csv("twitter_training.csv")

In [6]:
# Filtrage des lignes contenant des valeurs nulles dans la colonne "Tweet content"
df = df.filter(df["Tweet content"].isNotNull())

In [7]:
# Affichage du schéma pour vérifier le nom des colonnes
df.printSchema()

root
 |-- Tweet ID: integer (nullable = true)
 |-- Entity: string (nullable = true)
 |-- Sentiment: string (nullable = true)
 |-- Tweet content: string (nullable = true)



In [8]:
# Définition des étapes de prétraitement et du modèle dans la pipeline
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered_words")
countVectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features")
labelIndexer = StringIndexer(inputCol="Sentiment", outputCol="label")
svm = LinearSVC(maxIter=10)
ovr = OneVsRest(classifier=svm)

In [9]:
# Création de la pipeline
pipeline = Pipeline(stages=[tokenizer, remover, countVectorizer, labelIndexer, ovr])

In [10]:
# Division des données en ensembles d'entraînement et de test
(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)


In [11]:
# Entraînement de la pipeline
pipeline_model = pipeline.fit(training_data)

24/05/07 10:56:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv
24/05/07 10:56:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv
24/05/07 10:56:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training

In [12]:
# Prédiction sur les données de test
predictions = pipeline_model.transform(test_data)

In [13]:
# Évaluation du modèle
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy of SVM model:", accuracy)

24/05/07 10:57:06 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
24/05/07 10:57:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2401, Borderlands, Positive, im getting on borderlands and i will murder you all ,
 Schema: Tweet ID, Entity, Sentiment, Tweet content
Expected: Tweet ID but found: 2401
CSV file: file:///home/unamed/Projects/MST/bigData/twitter_training.csv

Accuracy of SVM model: 0.8786407766990292


                                                                                

In [14]:
print(predictions.columns)

['Tweet ID', 'Entity', 'Sentiment', 'Tweet content', 'words', 'filtered_words', 'features', 'label', 'rawPrediction', 'prediction']
